[PATCH] Remove ext4 xattr mbcache to fix CPU lockup when using ceph #107301

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi,

This is a followup on the discussions here:
https://bugzilla.kernel.org/show_bug.cgi?id=107301

The patch in comment 6 has been extensively tested on 11 ubuntu 14.04
with kernel 3.19 machines of our ceph cluster for about 10 days of heavy
ext4 xattr use without issue, without the patch same machines would
freeze every 4 to 24 hours depending on core count. The patch below is a
mechanical upgrade against the current linux tree, only the super.c
first hunk did not apply and was fixed by hand, kernel with defconfig
did build cleanly.

It would be great to have a working cache but it's not easy to do and
leaving the current code isn't nice to many ext4 xattr users out there
since their machine will at some point lockup: for us it worked for
about one year then we got suddenly lockup hell probably after some
limit was hit. 

The Lustre developpers have a patch to add a mount option to do the
mbcache removal as they had the same issue.

I assume maintainers of older kernels will consider backporting this
patch as well if it is accepted in current as the issue is present since
the beginning.

In case my mailer corrupts it the patch is here:

http://guerby.org/ftp/remove-ext4-mbcache-current.patch

Cc: Jan Kara <jack@xxxxxxxx>
Cc: Ted Ts'o <tytso@xxxxxxx>
Cc: Andreas Dilger <adilger.kernel@xxxxxxxxx>
Signed-off-by: Laurent GUERBY <laurent@xxxxxxxxxx>
---

--- linux/fs/ext4/ext4.h.orig	2015-11-20 09:32:30.995575644 +0100
+++ linux/fs/ext4/ext4.h	2015-11-20 09:37:53.880915440 +0100
@@ -1371,7 +1371,6 @@ struct ext4_sb_info {
 	struct list_head s_es_list;	/* List of inodes with reclaimable extents */
 	long s_es_nr_inode;
 	struct ext4_es_stats s_es_stats;
-	struct mb_cache *s_mb_cache;
 	spinlock_t s_es_lock ____cacheline_aligned_in_smp;
 
 	/* Ratelimit ext4 messages. */
--- linux/fs/ext4/super.c.orig	2015-11-20 09:32:31.007575695 +0100
+++ linux/fs/ext4/super.c	2015-11-20 09:39:11.373229371 +0100
@@ -55,7 +55,6 @@
 
 static struct ext4_lazy_init *ext4_li_info;
 static struct mutex ext4_li_mtx;
-static int ext4_mballoc_ready;
 static struct ratelimit_state ext4_mount_msg_ratelimit;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
@@ -857,10 +856,6 @@ static void ext4_put_super(struct super_
 		invalidate_bdev(sbi->journal_bdev);
 		ext4_blkdev_remove(sbi);
 	}
-	if (sbi->s_mb_cache) {
-		ext4_xattr_destroy_cache(sbi->s_mb_cache);
-		sbi->s_mb_cache = NULL;
-	}
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 	sb->s_fs_info = NULL;
@@ -3754,13 +3749,6 @@ static int ext4_fill_super(struct super_
 	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
 
 no_journal:
-	if (ext4_mballoc_ready) {
-		sbi->s_mb_cache = ext4_xattr_create_cache(sb->s_id);
-		if (!sbi->s_mb_cache) {
-			ext4_msg(sb, KERN_ERR, "Failed to create an mb_cache");
-			goto failed_mount_wq;
-		}
-	}
 
 	if ((DUMMY_ENCRYPTION_ENABLED(sbi) || ext4_has_feature_encrypt(sb)) &&
 	    (blocksize != PAGE_CACHE_SIZE)) {
@@ -5267,8 +5255,6 @@ static int __init ext4_init_fs(void)
 	err = ext4_init_mballoc();
 	if (err)
 		goto out2;
-	else
-		ext4_mballoc_ready = 1;
 	err = init_inodecache();
 	if (err)
 		goto out1;
@@ -5284,7 +5270,6 @@ out:
 	unregister_as_ext3();
 	destroy_inodecache();
 out1:
-	ext4_mballoc_ready = 0;
 	ext4_exit_mballoc();
 out2:
 	ext4_exit_sysfs();
--- linux/fs/ext4/xattr.h.orig	2015-11-20 09:32:31.007575695 +0100
+++ linux/fs/ext4/xattr.h	2015-11-20 09:37:53.884915457 +0100
@@ -124,9 +124,6 @@ extern int ext4_xattr_ibody_inline_set(h
 				       struct ext4_xattr_info *i,
 				       struct ext4_xattr_ibody_find *is);
 
-extern struct mb_cache *ext4_xattr_create_cache(char *name);
-extern void ext4_xattr_destroy_cache(struct mb_cache *);
-
 #ifdef CONFIG_EXT4_FS_SECURITY
 extern int ext4_init_security(handle_t *handle, struct inode *inode,
 			      struct inode *dir, const struct qstr *qstr);
--- linux/fs/ext4/xattr.c.orig	2015-11-20 09:32:31.007575695 +0100
+++ linux/fs/ext4/xattr.c	2015-11-20 09:37:53.884915457 +0100
@@ -53,7 +53,6 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/slab.h>
-#include <linux/mbcache.h>
 #include <linux/quotaops.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
@@ -80,10 +79,6 @@
 # define ea_bdebug(bh, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
-static void ext4_xattr_cache_insert(struct mb_cache *, struct buffer_head *);
-static struct buffer_head *ext4_xattr_cache_find(struct inode *,
-						 struct ext4_xattr_header *,
-						 struct mb_cache_entry **);
 static void ext4_xattr_rehash(struct ext4_xattr_header *,
 			      struct ext4_xattr_entry *);
 static int ext4_xattr_list(struct dentry *dentry, char *buffer,
@@ -114,9 +109,6 @@ const struct xattr_handler *ext4_xattr_h
 	NULL
 };
 
-#define EXT4_GET_MB_CACHE(inode)	(((struct ext4_sb_info *) \
-				inode->i_sb->s_fs_info)->s_mb_cache)
-
 static __le32 ext4_xattr_block_csum(struct inode *inode,
 				    sector_t block_nr,
 				    struct ext4_xattr_header *hdr)
@@ -278,7 +270,6 @@ ext4_xattr_block_get(struct inode *inode
 	struct ext4_xattr_entry *entry;
 	size_t size;
 	int error;
-	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "name=%d.%s, buffer=%p, buffer_size=%ld",
 		  name_index, name, buffer, (long)buffer_size);
@@ -300,7 +291,6 @@ bad_block:
 		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	entry = BFIRST(bh);
 	error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1);
 	if (error == -EFSCORRUPTED)
@@ -425,7 +415,6 @@ ext4_xattr_block_list(struct dentry *den
 	struct inode *inode = d_inode(dentry);
 	struct buffer_head *bh = NULL;
 	int error;
-	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 	ea_idebug(inode, "buffer=%p, buffer_size=%ld",
 		  buffer, (long)buffer_size);
@@ -447,7 +436,6 @@ ext4_xattr_block_list(struct dentry *den
 		error = -EFSCORRUPTED;
 		goto cleanup;
 	}
-	ext4_xattr_cache_insert(ext4_mb_cache, bh);
 	error = ext4_xattr_list_entries(dentry, BFIRST(bh), buffer, buffer_size);
 
 cleanup:
@@ -542,11 +530,8 @@ static void
 ext4_xattr_release_block(handle_t *handle, struct inode *inode,
 			 struct buffer_head *bh)
 {
-	struct mb_cache_entry *ce = NULL;
 	int error = 0;
-	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
-	ce = mb_cache_entry_get(ext4_mb_cache, bh->b_bdev, bh->b_blocknr);
 	BUFFER_TRACE(bh, "get_write_access");
 	error = ext4_journal_get_write_access(handle, bh);
 	if (error)
@@ -555,8 +540,6 @@ ext4_xattr_release_block(handle_t *handl
 	lock_buffer(bh);
 	if (BHDR(bh)->h_refcount == cpu_to_le32(1)) {
 		ea_bdebug(bh, "refcount now=0; freeing");
-		if (ce)
-			mb_cache_entry_free(ce);
 		get_bh(bh);
 		unlock_buffer(bh);
 		ext4_free_blocks(handle, inode, bh, 0, 1,
@@ -564,8 +547,6 @@ ext4_xattr_release_block(handle_t *handl
 				 EXT4_FREE_BLOCKS_FORGET);
 	} else {
 		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
-		if (ce)
-			mb_cache_entry_release(ce);
 		/*
 		 * Beware of this ugliness: Releasing of xattr block references
 		 * from different inodes can race and so we have to protect
@@ -778,17 +759,13 @@ ext4_xattr_block_set(handle_t *handle, s
 	struct super_block *sb = inode->i_sb;
 	struct buffer_head *new_bh = NULL;
 	struct ext4_xattr_search *s = &bs->s;
-	struct mb_cache_entry *ce = NULL;
 	int error = 0;
-	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
 
 #define header(x) ((struct ext4_xattr_header *)(x))
 
 	if (i->value && i->value_len > sb->s_blocksize)
 		return -ENOSPC;
 	if (s->base) {
-		ce = mb_cache_entry_get(ext4_mb_cache, bs->bh->b_bdev,
-					bs->bh->b_blocknr);
 		BUFFER_TRACE(bs->bh, "get_write_access");
 		error = ext4_journal_get_write_access(handle, bs->bh);
 		if (error)
@@ -796,18 +773,12 @@ ext4_xattr_block_set(handle_t *handle, s
 		lock_buffer(bs->bh);
 
 		if (header(s->base)->h_refcount == cpu_to_le32(1)) {
-			if (ce) {
-				mb_cache_entry_free(ce);
-				ce = NULL;
-			}
 			ea_bdebug(bs->bh, "modifying in-place");
 			error = ext4_xattr_set_entry(i, s);
 			if (!error) {
 				if (!IS_LAST_ENTRY(s->first))
 					ext4_xattr_rehash(header(s->base),
 							  s->here);
-				ext4_xattr_cache_insert(ext4_mb_cache,
-					bs->bh);
 			}
 			unlock_buffer(bs->bh);
 			if (error == -EFSCORRUPTED)
@@ -823,10 +794,6 @@ ext4_xattr_block_set(handle_t *handle, s
 			int offset = (char *)s->here - bs->bh->b_data;
 
 			unlock_buffer(bs->bh);
-			if (ce) {
-				mb_cache_entry_release(ce);
-				ce = NULL;
-			}
 			ea_bdebug(bs->bh, "cloning");
 			s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
 			error = -ENOMEM;
@@ -863,37 +830,7 @@ ext4_xattr_block_set(handle_t *handle, s
 
 inserted:
 	if (!IS_LAST_ENTRY(s->first)) {
-		new_bh = ext4_xattr_cache_find(inode, header(s->base), &ce);
-		if (new_bh) {
-			/* We found an identical block in the cache. */
-			if (new_bh == bs->bh)
-				ea_bdebug(new_bh, "keeping");
-			else {
-				/* The old block is released after updating
-				   the inode. */
-				error = dquot_alloc_block(inode,
-						EXT4_C2B(EXT4_SB(sb), 1));
-				if (error)
-					goto cleanup;
-				BUFFER_TRACE(new_bh, "get_write_access");
-				error = ext4_journal_get_write_access(handle,
-								      new_bh);
-				if (error)
-					goto cleanup_dquot;
-				lock_buffer(new_bh);
-				le32_add_cpu(&BHDR(new_bh)->h_refcount, 1);
-				ea_bdebug(new_bh, "reusing; refcount now=%d",
-					le32_to_cpu(BHDR(new_bh)->h_refcount));
-				unlock_buffer(new_bh);
-				error = ext4_handle_dirty_xattr_block(handle,
-								      inode,
-								      new_bh);
-				if (error)
-					goto cleanup_dquot;
-			}
-			mb_cache_entry_release(ce);
-			ce = NULL;
-		} else if (bs->bh && s->base == bs->bh->b_data) {
+		if (bs->bh && s->base == bs->bh->b_data) {
 			/* We were modifying this block in-place. */
 			ea_bdebug(bs->bh, "keeping this block");
 			new_bh = bs->bh;
@@ -938,7 +875,6 @@ getblk_failed:
 			memcpy(new_bh->b_data, s->base, new_bh->b_size);
 			set_buffer_uptodate(new_bh);
 			unlock_buffer(new_bh);
-			ext4_xattr_cache_insert(ext4_mb_cache, new_bh);
 			error = ext4_handle_dirty_xattr_block(handle,
 							      inode, new_bh);
 			if (error)
@@ -955,8 +891,6 @@ getblk_failed:
 	error = 0;
 
 cleanup:
-	if (ce)
-		mb_cache_entry_release(ce);
 	brelse(new_bh);
 	if (!(bs->bh && s->base == bs->bh->b_data))
 		kfree(s->base);
@@ -1516,41 +1450,9 @@ cleanup:
 void
 ext4_xattr_put_super(struct super_block *sb)
 {
-	mb_cache_shrink(sb->s_bdev);
+	return;
 }
 
-/*
- * ext4_xattr_cache_insert()
- *
- * Create a new entry in the extended attribute cache, and insert
- * it unless such an entry is already in the cache.
- *
- * Returns 0, or a negative error number on failure.
- */
-static void
-ext4_xattr_cache_insert(struct mb_cache *ext4_mb_cache, struct buffer_head *bh)
-{
-	__u32 hash = le32_to_cpu(BHDR(bh)->h_hash);
-	struct mb_cache_entry *ce;
-	int error;
-
-	ce = mb_cache_entry_alloc(ext4_mb_cache, GFP_NOFS);
-	if (!ce) {
-		ea_bdebug(bh, "out of memory");
-		return;
-	}
-	error = mb_cache_entry_insert(ce, bh->b_bdev, bh->b_blocknr, hash);
-	if (error) {
-		mb_cache_entry_free(ce);
-		if (error == -EBUSY) {
-			ea_bdebug(bh, "already in cache");
-			error = 0;
-		}
-	} else {
-		ea_bdebug(bh, "inserting [%x]", (int)hash);
-		mb_cache_entry_release(ce);
-	}
-}
 
 /*
  * ext4_xattr_cmp()
@@ -1592,55 +1494,6 @@ ext4_xattr_cmp(struct ext4_xattr_header
 	return 0;
 }
 
-/*
- * ext4_xattr_cache_find()
- *
- * Find an identical extended attribute block.
- *
- * Returns a pointer to the block found, or NULL if such a block was
- * not found or an error occurred.
- */
-static struct buffer_head *
-ext4_xattr_cache_find(struct inode *inode, struct ext4_xattr_header *header,
-		      struct mb_cache_entry **pce)
-{
-	__u32 hash = le32_to_cpu(header->h_hash);
-	struct mb_cache_entry *ce;
-	struct mb_cache *ext4_mb_cache = EXT4_GET_MB_CACHE(inode);
-
-	if (!header->h_hash)
-		return NULL;  /* never share */
-	ea_idebug(inode, "looking for cached blocks [%x]", (int)hash);
-again:
-	ce = mb_cache_entry_find_first(ext4_mb_cache, inode->i_sb->s_bdev,
-				       hash);
-	while (ce) {
-		struct buffer_head *bh;
-
-		if (IS_ERR(ce)) {
-			if (PTR_ERR(ce) == -EAGAIN)
-				goto again;
-			break;
-		}
-		bh = sb_bread(inode->i_sb, ce->e_block);
-		if (!bh) {
-			EXT4_ERROR_INODE(inode, "block %lu read error",
-					 (unsigned long) ce->e_block);
-		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
-				EXT4_XATTR_REFCOUNT_MAX) {
-			ea_idebug(inode, "block %lu refcount %d>=%d",
-				  (unsigned long) ce->e_block,
-				  le32_to_cpu(BHDR(bh)->h_refcount),
-					  EXT4_XATTR_REFCOUNT_MAX);
-		} else if (ext4_xattr_cmp(header, BHDR(bh)) == 0) {
-			*pce = ce;
-			return bh;
-		}
-		brelse(bh);
-		ce = mb_cache_entry_find_next(ce, inode->i_sb->s_bdev, hash);
-	}
-	return NULL;
-}
 
 #define NAME_HASH_SHIFT 5
 #define VALUE_HASH_SHIFT 16
@@ -1709,18 +1562,3 @@ static void ext4_xattr_rehash(struct ext
 }
 
 #undef BLOCK_HASH_SHIFT
-
-#define	HASH_BUCKET_BITS	10
-
-struct mb_cache *
-ext4_xattr_create_cache(char *name)
-{
-	return mb_cache_create(name, HASH_BUCKET_BITS);
-}
-
-void ext4_xattr_destroy_cache(struct mb_cache *cache)
-{
-	if (cache)
-		mb_cache_destroy(cache);
-}
-


--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html



[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux