[PATCH 09/10 v5] ext4: convert unwritten extents from extent status tree in end_io

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



From: Zheng Liu <wenqing.lz@xxxxxxxxxx>

This commit converts unwritten extents in the extent status tree to
written in the end_io callback functions and in ext4_ext_direct_IO.

Signed-off-by: Zheng Liu <wenqing.lz@xxxxxxxxxx>
Cc: "Theodore Ts'o" <tytso@xxxxxxx>
Cc: Jan Kara <jack@xxxxxxx>
---
 fs/ext4/extents.c           |   6 +-
 fs/ext4/extents_status.c    | 180 ++++++++++++++++++++++++++++++++++++++++----
 fs/ext4/extents_status.h    |   2 +
 fs/ext4/inode.c             |   5 ++
 fs/ext4/page-io.c           |   8 +-
 include/trace/events/ext4.h |  25 ++++++
 6 files changed, 208 insertions(+), 18 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9f21430..a03cabf 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4443,8 +4443,10 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
 			ret = PTR_ERR(handle);
 			break;
 		}
-		ret = ext4_map_blocks(handle, inode, &map,
-				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+		down_write(&EXT4_I(inode)->i_data_sem);
+		ret = ext4_ext_map_blocks(handle, inode, &map,
+					  EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+		up_write(&EXT4_I(inode)->i_data_sem);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
 			ext4_msg(inode->i_sb, KERN_ERR,
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index bac5286..eab8893 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -248,10 +248,11 @@ ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
 	ext4_lblk_t ret = EXT_MAX_BLOCKS;
+	unsigned long flags;
 
 	trace_ext4_es_find_delayed_extent_enter(inode, es->es_lblk);
 
-	read_lock(&EXT4_I(inode)->i_es_lock);
+	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 	tree = &EXT4_I(inode)->i_es_tree;
 
 	/* find extent in cache firstly */
@@ -291,7 +292,7 @@ out:
 		}
 	}
 
-	read_unlock(&EXT4_I(inode)->i_es_lock);
+	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 
 	ext4_es_lru_add(inode);
 	trace_ext4_es_find_delayed_extent_exit(inode, es, ret);
@@ -458,6 +459,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 {
 	struct extent_status newes;
 	ext4_lblk_t end = lblk + len - 1;
+	unsigned long flags;
 	int err = 0;
 
 	es_debug("add [%u/%u) %llu %llx to extent status tree of inode %lu\n",
@@ -471,14 +473,14 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
 	ext4_es_store_status(&newes, status);
 	trace_ext4_es_insert_extent(inode, &newes);
 
-	write_lock(&EXT4_I(inode)->i_es_lock);
+	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 	err = __es_remove_extent(inode, lblk, end);
 	if (err != 0)
 		goto error;
 	err = __es_insert_extent(inode, &newes);
 
 error:
-	write_unlock(&EXT4_I(inode)->i_es_lock);
+	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 
 	ext4_es_lru_add(inode);
 	ext4_es_print_tree(inode);
@@ -498,13 +500,14 @@ int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es)
 	struct ext4_es_tree *tree;
 	struct extent_status *es1 = NULL;
 	struct rb_node *node;
+	unsigned long flags;
 	int found = 0;
 
 	trace_ext4_es_lookup_extent_enter(inode, es->es_lblk);
 	es_debug("lookup extent in block %u\n", es->es_lblk);
 
 	tree = &EXT4_I(inode)->i_es_tree;
-	read_lock(&EXT4_I(inode)->i_es_lock);
+	read_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 
 	/* find extent in cache firstly */
 	es->es_len = es->es_pblk = 0;
@@ -539,7 +542,7 @@ out:
 		es->es_pblk = es1->es_pblk;
 	}
 
-	read_unlock(&EXT4_I(inode)->i_es_lock);
+	read_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 
 	ext4_es_lru_add(inode);
 	trace_ext4_es_lookup_extent_exit(inode, es, found);
@@ -649,6 +652,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 			  ext4_lblk_t len)
 {
 	ext4_lblk_t end;
+	unsigned long flags;
 	int err = 0;
 
 	trace_ext4_es_remove_extent(inode, lblk, len);
@@ -658,9 +662,9 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 	end = lblk + len - 1;
 	BUG_ON(end < lblk);
 
-	write_lock(&EXT4_I(inode)->i_es_lock);
+	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
 	err = __es_remove_extent(inode, lblk, end);
-	write_unlock(&EXT4_I(inode)->i_es_lock);
+	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
 	ext4_es_print_tree(inode);
 	return err;
 }
@@ -671,6 +675,7 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 					struct ext4_sb_info, s_es_shrinker);
 	struct ext4_inode_info *ei;
 	struct list_head *cur, *tmp, scanned;
+	unsigned long flags;
 	int nr_to_scan = sc->nr_to_scan;
 	int ret, nr_shrunk = 0;
 
@@ -687,16 +692,16 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
 
-		read_lock(&ei->i_es_lock);
+		read_lock_irqsave(&ei->i_es_lock, flags);
 		if (ei->i_es_lru_nr == 0) {
-			read_unlock(&ei->i_es_lock);
+			read_unlock_irqrestore(&ei->i_es_lock, flags);
 			continue;
 		}
-		read_unlock(&ei->i_es_lock);
+		read_unlock_irqrestore(&ei->i_es_lock, flags);
 
-		write_lock(&ei->i_es_lock);
+		write_lock_irqsave(&ei->i_es_lock, flags);
 		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
-		write_unlock(&ei->i_es_lock);
+		write_unlock_irqrestore(&ei->i_es_lock, flags);
 
 		nr_shrunk += ret;
 		nr_to_scan -= ret;
@@ -756,14 +761,15 @@ static int ext4_es_reclaim_extents_count(struct super_block *sb)
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_inode_info *ei;
 	struct list_head *cur;
+	unsigned long flags;
 	int nr_cached = 0;
 
 	spin_lock(&sbi->s_es_lru_lock);
 	list_for_each(cur, &sbi->s_es_lru) {
 		ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
-		read_lock(&ei->i_es_lock);
+		read_lock_irqsave(&ei->i_es_lock, flags);
 		nr_cached += ei->i_es_lru_nr;
-		read_unlock(&ei->i_es_lock);
+		read_unlock_irqrestore(&ei->i_es_lock, flags);
 	}
 	spin_unlock(&sbi->s_es_lru_lock);
 	trace_ext4_es_reclaim_extents_count(sb, nr_cached);
@@ -801,3 +807,147 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 	tree->cache_es = NULL;
 	return nr_shrunk;
 }
+
+/*
+ * Convert cached extents overlapping the byte range [offset, offset + size)
+ * to EXTENT_STATUS_WRITTEN, splitting extents that straddle the range.
+ * Returns 0 on success or the error from __es_insert_extent().
+ */
+int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
+				      size_t size)
+{
+	struct ext4_es_tree *tree;
+	struct rb_node *node;
+	struct extent_status *es, orig_es, conv_es;
+	ext4_lblk_t lblk, len, end, len1, len2;
+	ext4_fsblk_t block;
+	unsigned long flags;
+	unsigned int blkbits;
+	int err = 0;
+
+	trace_ext4_es_convert_unwritten_extents(inode, offset, size);
+	blkbits = inode->i_blkbits;
+	lblk = offset >> blkbits;
+	len = (EXT4_BLOCK_ALIGN(offset + size, blkbits) >> blkbits) - lblk;
+
+	end = lblk + len - 1;
+	BUG_ON(end < lblk);
+
+	tree = &EXT4_I(inode)->i_es_tree;
+	write_lock_irqsave(&EXT4_I(inode)->i_es_lock, flags);
+	es = __es_tree_search(&tree->root, lblk);
+	if (!es || es->es_lblk > end)
+		goto out;
+
+	tree->cache_es = NULL;
+
+	orig_es.es_lblk = es->es_lblk;
+	orig_es.es_len = es->es_len;
+	orig_es.es_pblk = es->es_pblk;
+
+	len1 = lblk > es->es_lblk ? lblk - es->es_lblk : 0;
+	len2 = ext4_es_end(es) > end ? ext4_es_end(es) - end : 0;
+	/* NOTE(review): if len2 == 0 the cut-off tail is never re-inserted */
+	if (len1 > 0)
+		es->es_len = len1;
+	if (len2 > 0) {
+		if (len1 > 0) {
+			struct extent_status newes;
+
+			newes.es_lblk = end + 1;
+			newes.es_len = len2;
+			block = ext4_es_pblock(&orig_es) +
+				orig_es.es_len - len2;
+			ext4_es_store_pblock(&newes, block);
+			ext4_es_store_status(&newes, ext4_es_status(&orig_es));
+			err = __es_insert_extent(inode, &newes);
+			if (err) {
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				es->es_pblk = orig_es.es_pblk;
+				goto out;
+			}
+
+			conv_es.es_lblk = orig_es.es_lblk + len1;
+			conv_es.es_len = orig_es.es_len - len1 - len2;
+			block = ext4_es_pblock(&orig_es) + len1;
+			ext4_es_store_pblock(&conv_es, block);
+			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+			err = __es_insert_extent(inode, &conv_es);
+			if (err) {
+				int err2 = __es_remove_extent(inode,
+							conv_es.es_lblk,
+							ext4_es_end(&newes));
+				if (err2)
+					goto out;
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				es->es_pblk = orig_es.es_pblk;
+				goto out;
+			}
+		} else {
+			es->es_lblk = end + 1;
+			es->es_len = len2;
+			block = ext4_es_pblock(&orig_es) +
+				orig_es.es_len - len2;
+			ext4_es_store_pblock(es, block);
+
+			conv_es.es_lblk = orig_es.es_lblk;
+			conv_es.es_len = orig_es.es_len - len2;
+			ext4_es_store_pblock(&conv_es,
+					     ext4_es_pblock(&orig_es));
+			ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+			err = __es_insert_extent(inode, &conv_es);
+			if (err) {
+				es->es_lblk = orig_es.es_lblk;
+				es->es_len = orig_es.es_len;
+				es->es_pblk = orig_es.es_pblk;
+			}
+		}
+		goto out;
+	}
+
+	if (len1 > 0) {
+		node = rb_next(&es->rb_node);
+		if (node)
+			es = rb_entry(node, struct extent_status, rb_node);
+		else
+			es = NULL;
+	}
+
+	while (es && ext4_es_end(es) <= end) {
+		node = rb_next(&es->rb_node);
+		ext4_es_store_status(es, EXTENT_STATUS_WRITTEN);
+		/* no successor: stop instead of handing NULL to rb_entry() */
+		if (!node) {
+			es = NULL;
+			break;
+		}
+		es = rb_entry(node, struct extent_status, rb_node);
+	}
+
+	if (es && es->es_lblk < end + 1) {
+		ext4_lblk_t orig_len = es->es_len;
+
+		/*
+		 * Build conv_es from es before es is trimmed below, so no
+		 * temporary copy of es is needed.
+		 */
+		len1 = ext4_es_end(es) - end;
+		conv_es.es_lblk = es->es_lblk;
+		conv_es.es_len = es->es_len - len1;
+		ext4_es_store_pblock(&conv_es, ext4_es_pblock(es));
+		ext4_es_store_status(&conv_es, EXTENT_STATUS_WRITTEN);
+
+		es->es_lblk = end + 1;
+		es->es_len = len1;
+		block = ext4_es_pblock(es) + orig_len - len1;
+		ext4_es_store_pblock(es, block);
+
+		err = __es_insert_extent(inode, &conv_es);
+	}
+
+out:
+	write_unlock_irqrestore(&EXT4_I(inode)->i_es_lock, flags);
+	return err;
+}
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 938ad2b..2849d74 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -54,6 +54,8 @@ extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
 extern ext4_lblk_t ext4_es_find_delayed_extent(struct inode *inode,
 					       struct extent_status *es);
 extern int ext4_es_lookup_extent(struct inode *inode, struct extent_status *es);
+extern int ext4_es_convert_unwritten_extents(struct inode *inode, loff_t offset,
+					     size_t size);
 
 static inline int ext4_es_is_written(struct extent_status *es)
 {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 670779a..08cf720 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3063,6 +3063,7 @@ out:
 		io_end->result = ret;
 	}
 
+	ext4_es_convert_unwritten_extents(inode, offset, size);
 	ext4_add_complete_io(io_end);
 }
 
@@ -3088,6 +3089,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
 	 */
 	inode = io_end->inode;
 	ext4_set_io_unwritten_flag(inode, io_end);
+	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
 	ext4_add_complete_io(io_end);
 out:
 	bh->b_private = NULL;
@@ -3246,6 +3248,9 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 	} else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
 						EXT4_STATE_DIO_UNWRITTEN)) {
 		int err;
+		err = ext4_es_convert_unwritten_extents(inode, offset, ret);
+		if (err)
+			ret = err;
 		/*
 		 * for non AIO case, since the IO is already
 		 * completed, we could do the conversion right here
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 0016fbc..66ea30e 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -276,6 +276,13 @@ static void ext4_end_bio(struct bio *bio, int error)
 		error = 0;
 	bio_put(bio);
 
+	/*
+	 * We need to convert unwritten extents in extent status tree before
+	 * end_page_writeback() is called.  Otherwise, when dioread_nolock is
+	 * enabled, we are likely to read stale data.
+	 */
+	inode = io_end->inode;
+	ext4_es_convert_unwritten_extents(inode, io_end->offset, io_end->size);
 	for (i = 0; i < io_end->num_io_pages; i++) {
 		struct page *page = io_end->pages[i]->p_page;
 		struct buffer_head *bh, *head;
@@ -305,7 +312,6 @@ static void ext4_end_bio(struct bio *bio, int error)
 		put_io_page(io_end->pages[i]);
 	}
 	io_end->num_io_pages = 0;
-	inode = io_end->inode;
 
 	if (error) {
 		io_end->flag |= EXT4_IO_END_ERROR;
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index f0734b3..d32e3d5 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -2233,6 +2233,31 @@ TRACE_EVENT(ext4_es_lookup_extent_exit,
 		  __entry->found ? __entry->status : 0)
 );
 
+/* Emitted on entry to ext4_es_convert_unwritten_extents(). */
+TRACE_EVENT(ext4_es_convert_unwritten_extents,
+	TP_PROTO(struct inode *inode, loff_t offset, loff_t size),
+
+	TP_ARGS(inode, offset, size),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	loff_t,	offset			)
+		__field(	loff_t,	size			)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->offset	= offset;
+		__entry->size	= size;
+	),
+
+	TP_printk("dev %d,%d ino %lu convert unwritten extents [%llu/%llu)",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->ino, __entry->offset, __entry->size)
+);
+
 TRACE_EVENT(ext4_es_reclaim_extents_count,
 	TP_PROTO(struct super_block *sb, int nr_cached),
 
-- 
1.7.12.rc2.18.g61b472e

--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux