Ext4: Set file system to read-only by I/O error threshold

stufever@xxxxxxxxx · Fri, 5 Aug 2011 15:45:04 +0800

From: Wang Shaoyan <wangshaoyan.pt@xxxxxxxxxx>

Some versions of Hadoop use access(2) to check whether the hard disk holding a data chunk is online; if access(2) returns an error, Hadoop marks the disk it called access(2) on as offline. This method works for Ext3/4 with a journal, because when jbd/jbd2 encounters an I/O error, the file system is set read-only. But in Ext4 no-journal mode there is no jbd2 to set the file system read-only when an I/O error happens, so access(2) from Hadoop cannot reliably detect that the hard disk is offline.

This patch tries to fix the above problem from the kernel side. A counter records the number of I/O errors seen on a block device. When the I/O error counter reaches the threshold, every fs that has registered an eio_handler on the block device is set read-only. The I/O error threshold is configurable; the fs is set read-only in either of 2 conditions:
1) within a single sampling interval, the number of I/O errors reaches the pre-set threshold
2) I/O errors keep happening in consecutive sampling intervals, and the sum of the errors reaches the pre-set threshold

Then the application can find the file system is set as read-only, and call its own failure tolerance procedures.

There are 2 interfaces exported to user space via sysfs:
/sys/block/sd[?]/eio/threshold --- I/O error threshold
/sys/block/sd[?]/eio/interval  --- sampling interval in seconds

If the threshold is set to 0, the I/O error accounting is disabled and everything behaves as before.

Signed-off-by: Wang Shaoyan <wangshaoyan.pt@xxxxxxxxxx>
---
 block/genhd.c      |   62 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/bio.c           |   34 +++++++++++++++++++++++++++++
 fs/block_dev.c     |   16 +++++++++++++
 fs/ext4/ext4.h     |    3 ++
 fs/ext4/super.c    |   38 ++++++++++++++++++++++++++++++++
 include/linux/fs.h |   16 +++++++++++++
 6 files changed, 169 insertions(+)

--- a/block/genhd.c
+++ b/block/genhd.c
@@ -872,6 +872,13 @@ static ssize_t disk_discard_alignment_sh
 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 }
 
+static ssize_t eio_show(struct device *dev,
+			struct device_attribute *attr,
+			char *buf);
+static ssize_t eio_store(struct device *dev,
+			 struct device_attribute *attr,
+			 const char *buf, size_t count);
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
@@ -892,6 +899,10 @@ static struct device_attribute dev_attr_
 	__ATTR(io-timeout-fail,  S_IRUGO|S_IWUSR, part_timeout_show,
 		part_timeout_store);
 #endif
+static DEVICE_ATTR(threshold, S_IRUGO|S_IWUSR, eio_show,
+		   eio_store);	/* eio/threshold: I/O error threshold */
+static DEVICE_ATTR(interval, S_IRUGO|S_IWUSR, eio_show,
+		   eio_store);	/* eio/interval: sampling interval, seconds */
 
 static struct attribute *disk_attrs[] = {
 	&dev_attr_range.attr,
@@ -913,15 +924,66 @@ static struct attribute *disk_attrs[] = 
 	NULL
 };
 
+static struct attribute *eio_attrs[] = {	/* "eio" group files */
+	&dev_attr_threshold.attr,
+	&dev_attr_interval.attr,
+	NULL
+};
 static struct attribute_group disk_attr_group = {
 	.attrs = disk_attrs,
 };
+static struct attribute_group eio_attr_group = {
+	.name = "eio",	/* attributes appear in an "eio" subdirectory */
+	.attrs = eio_attrs,
+};
 
 static const struct attribute_group *disk_attr_groups[] = {
 	&disk_attr_group,
+	&eio_attr_group,
 	NULL
 };
 
+/* sysfs show for eio/threshold and eio/interval (bdev of partition 0). */
+static ssize_t eio_show(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct block_device *bdev = bdget_disk(dev_to_disk(dev), 0);
+	int ret = -EINVAL;
+
+	if (!bdev)	/* bdget_disk() returns NULL on failure */
+		return -ENOMEM;
+	if (attr == &dev_attr_interval)
+		ret = sprintf(buf, "%u\n", bdev->bd_eio.interval);
+	else if (attr == &dev_attr_threshold)
+		ret = sprintf(buf, "%u\n", bdev->bd_eio.threshold);
+	bdput(bdev);	/* drop the reference taken by bdget_disk() */
+	return ret;
+}
+
+/* sysfs store for eio/threshold and eio/interval; rejects interval 0. */
+static ssize_t eio_store(struct device *dev, struct device_attribute *attr,
+			 const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	unsigned long t;
+	char *endp;
+
+	/* Validate first so no bdev reference is held on the error path. */
+	t = simple_strtoul(skip_spaces(buf), &endp, 0);
+	if (*skip_spaces(endp) || t > 0xffffffff ||
+	    (attr == &dev_attr_interval && t == 0))
+		return -EINVAL;
+
+	bdev = bdget_disk(dev_to_disk(dev), 0);
+	if (!bdev)	/* bdget_disk() returns NULL on failure */
+		return -ENOMEM;
+	if (attr == &dev_attr_interval)
+		bdev->bd_eio.interval = t;
+	else if (attr == &dev_attr_threshold)
+		bdev->bd_eio.threshold = t;
+	bdput(bdev);	/* drop the reference taken by bdget_disk() */
+	return count;
+}
 static void disk_free_ptbl_rcu_cb(struct rcu_head *head)
 {
 	struct disk_part_tbl *ptbl =
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1408,6 +1408,35 @@ void bio_check_pages_dirty(struct bio *b
 	}
 }
 
+static void bio_error_handler(struct block_device *bdev)
+{
+	struct eio *eio = &bdev->bd_contains->bd_eio;
+	struct eio_handler *eio_handler;
+	unsigned long time;
+	unsigned int threshold = eio->threshold;
+
+	if (threshold == 0) {
+		return;
+	}
+
+	spin_lock(&eio->lock);
+	time = jiffies;
+	if (time_after(eio->last_jiffies + eio->interval * HZ, time))
+		eio->counter++;
+	else
+		eio->counter = 1;
+	eio->last_jiffies = time;
+	printk(KERN_ERR "bio error counter: %u\n", eio->counter);
+
+	if (eio->counter >= threshold) {
+		list_for_each_entry(eio_handler, &eio->head, list) {
+			if (eio_handler->handler)
+				eio_handler->handler(eio_handler);
+		}
+	}
+	spin_unlock(&eio->lock);
+}
+
 /**
  * bio_endio - end I/O on a bio
  * @bio:	bio
@@ -1429,6 +1458,11 @@ void bio_endio(struct bio *bio, int erro
 	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 		error = -EIO;
 
+	if (error) {
+		if (bio->bi_bdev)
+			bio_error_handler(bio->bi_bdev);
+	}
+
 	if (bio->bi_end_io)
 		bio->bi_end_io(bio, error);
 }
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -29,6 +29,9 @@
 #include <asm/uaccess.h>
 #include "internal.h"
 
+#define DEFAULT_EIO_THRESHOLD	10
+#define DEFAULT_EIO_INTERVAL	5
+
 struct bdev_inode {
 	struct block_device bdev;
 	struct inode vfs_inode;
@@ -453,6 +456,7 @@ static void init_once(void *foo)
 {
 	struct bdev_inode *ei = (struct bdev_inode *) foo;
 	struct block_device *bdev = &ei->bdev;
+	struct eio *eio = &bdev->bd_eio;
 
 	memset(bdev, 0, sizeof(*bdev));
 	mutex_init(&bdev->bd_mutex);
@@ -461,6 +465,8 @@ static void init_once(void *foo)
 #ifdef CONFIG_SYSFS
 	INIT_LIST_HEAD(&bdev->bd_holder_list);
 #endif
+	spin_lock_init(&eio->lock);
+	INIT_LIST_HEAD(&eio->head);
 	inode_init_once(&ei->vfs_inode);
 	/* Initialize mutex for freeze. */
 	mutex_init(&bdev->bd_fsfreeze_mutex);
@@ -557,6 +563,7 @@ struct block_device *bdget(dev_t dev)
 {
 	struct block_device *bdev;
 	struct inode *inode;
+	struct eio *eio;
 
 	inode = iget5_locked(blockdev_superblock, hash(dev),
 			bdev_test, bdev_set, &dev);
@@ -565,6 +572,7 @@ struct block_device *bdget(dev_t dev)
 		return NULL;
 
 	bdev = &BDEV_I(inode)->bdev;
+	eio = &bdev->bd_eio;
 
 	if (inode->i_state & I_NEW) {
 		bdev->bd_contains = NULL;
@@ -580,6 +588,9 @@ struct block_device *bdget(dev_t dev)
 		spin_lock(&bdev_lock);
 		list_add(&bdev->bd_list, &all_bdevs);
 		spin_unlock(&bdev_lock);
+		/* counter and last_jiffies rely on the memset in init_once() */
+		eio->threshold = DEFAULT_EIO_THRESHOLD;
+		eio->interval = DEFAULT_EIO_INTERVAL;
 		unlock_new_inode(inode);
 	}
 	return bdev;
@@ -1379,6 +1390,11 @@ static int __blkdev_put(struct block_dev
 	}
 	if (!bdev->bd_openers) {
 		struct module *owner = disk->fops->owner;
+		if (bdev->bd_contains == bdev) {
+			struct eio *eio = &bdev->bd_contains->bd_eio;
+
+			WARN_ON(!list_empty(&eio->head));
+		}
 
 		put_disk(disk);
 		module_put(owner);
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1114,6 +1114,9 @@ struct ext4_sb_info {
 
 	/* workqueue for dio unwritten */
 	struct workqueue_struct *dio_unwritten_wq;
+
+	/* I/O error handler, called when the error counter reaches the threshold */
+	struct eio_handler s_eio_handler;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,6 +334,18 @@ static void ext4_handle_error(struct sup
 			sb->s_id);
 }
 
+/* eio callback: flag the filesystem mounted on handler->bdev as read-only. */
+static void ext4_eio_handler(struct eio_handler *handler)
+{
+	struct super_block *sb = handler->bdev->bd_super;
+
+	/* Nothing to do if the filesystem is already read-only. */
+	if (!(sb->s_flags & MS_RDONLY)) {
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
+		sb->s_flags |= MS_RDONLY;
+	}
+}
+
 void __ext4_error(struct super_block *sb, const char *function,
 		const char *fmt, ...)
 {
@@ -640,6 +652,14 @@ static void ext4_put_super(struct super_
 	if (sb->s_dirt)
 		ext4_commit_super(sb, 1);
 
+	if (!sbi->s_journal) {
+		struct eio_handler *eio_handler = &sbi->s_eio_handler;
+		struct eio *eio = &sb->s_bdev->bd_contains->bd_eio;
+		spin_lock_bh(&eio->lock);
+		list_del_init(&eio_handler->list);
+		spin_unlock_bh(&eio->lock);
+	}
+
 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -2417,6 +2437,8 @@ static int ext4_fill_super(struct super_
 	__u64 blocks_count;
 	int err;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	struct eio *eio;
+	struct eio_handler *eio_handler;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
 	if (!sbi)
@@ -3008,6 +3030,22 @@ no_journal:
 
 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
 
+	/*
+	 * The I/O error handler is only registered in no-journal mode,
+	 * because the journal already does the same thing.
+	 */
+	if (!EXT4_SB(sb)->s_journal) {
+		eio = &sb->s_bdev->bd_contains->bd_eio;
+		eio_handler = &sbi->s_eio_handler;
+		INIT_LIST_HEAD(&eio_handler->list);
+		eio_handler->handler = ext4_eio_handler;
+		eio_handler->bdev = sb->s_bdev;
+
+		spin_lock_bh(&eio->lock);
+		list_add(&eio_handler->list, &eio->head);
+		spin_unlock_bh(&eio->lock);
+	}
+
 	lock_kernel();
 	return 0;
 
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -678,6 +678,20 @@ struct address_space {
 	 * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
 	 */
 
+struct eio {	/* I/O error accounting kept in each struct block_device */
+	unsigned int threshold;	/* errors needed to run handlers; 0 = off */
+	unsigned int counter;	/* errors counted, see bio_error_handler() */
+	unsigned long last_jiffies;	/* jiffies of the last counted error */
+	unsigned int interval;	/* sampling interval in seconds */
+	spinlock_t lock;	/* held for counter updates and list changes */
+	struct list_head head;	/* list of registered struct eio_handler */
+};
+struct eio_handler {	/* callback registered on an eio; ->handler may be NULL */
+	struct list_head list;	/* link in eio->head */
+	void (*handler)(struct eio_handler *eio_handler);
+	struct block_device *bdev;	/* set by ext4 to sb->s_bdev */
+};
+
 struct block_device {
 	dev_t			bd_dev;  /* not a kdev_t - it's a search key */
 	struct inode *		bd_inode;	/* will die */
@@ -709,6 +723,8 @@ struct block_device {
 	int			bd_fsfreeze_count;
 	/* Mutex for freeze */
 	struct mutex		bd_fsfreeze_mutex;
+
+	struct eio		bd_eio;
 };
 
 /*
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html