This patch adds operations on a delayed extent tree. Signed-off-by; Yongqiang Yang <xiaoqiangnk@xxxxxxxxx> --- fs/ext4/Makefile | 2 +- fs/ext4/delayed_extents.c | 412 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/delayed_extents.h | 18 ++ 3 files changed, 431 insertions(+), 1 deletions(-) create mode 100644 fs/ext4/delayed_extents.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 56fd8f86..ee16ad3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o + mmp.o indirect.o delayed_extents.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c new file mode 100644 index 0000000..8da7b78 --- /dev/null +++ b/fs/ext4/delayed_extents.c @@ -0,0 +1,412 @@ +/* + * fs/ext4/delayed_extents.c + * + * Written by Yongqiang Yang <xiaoqiangnk@xxxxxxxxx> + * + * Ext4 delayed extents core functions. + */ +#include <linux/rbtree.h> +#include "ext4.h" +#include "delayed_extents.h" +#include "ext4_extents.h" + +/* + * delayed extent implementation for ext4. + * + * + * ========================================================================== + * 1. Why delayed extent implementation ? + * + * Without delayed extent, ext4 identifies a delayed extent by looking up + * page cache, this has several deficiencies - complicated, buggy, and + * inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need to know if + * a block or a range of blocks are belonged to a delayed extent. + * + * Let us have a look at how they do without delayed extents implementation. + * -- FIEMAP + * FIEMAP looks up page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is already + * under delayed allocation or not to determine whether quota reserving + * is needed for the cluster. + * + * -- punch hole + * punch hole looks up page cache to identify a delayed extent. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is mapped, If + * there are not very many delayed buffers, then it is time comsuming. + * + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, bigalloc and + * writeout can figure out if a block or a range of blocks is under delayed + * allocation(belonged to a delayed extent) or not by searching the delayed + * extent tree. + * + * + * ========================================================================== + * 2. ext4 delayed extents impelmentation + * + * -- delayed extent + * A delayed extent is a range of blocks which are contiguous logically and + * under delayed allocation. Unlike extent in ext4, delayed extent in ext4 + * is a in-memory struct, there is no corresponding on-disk data. There is + * no limit on length of delayed extent, so a delayed extent can contain as + * many blocks as they are contiguous logically. + * + * -- delayed extent tree + * Every inode has a delayed extent tree and all under delayed allocation + * blocks are added to the tree as dealyed extents. Delayed extents in + * the tree are ordered by logical block no. + * + * -- operations on a delayed extent tree + * There are three operations on a delayed extent tree: find next delayed + * extent, adding a space(a range of blocks) and removing a space. + * + * -- race on a delayed extent tree + * Delayed extent tree is protected inode->i_data_sem like extent tree. + * + * + * ========================================================================== + * 3. performance analysis + * -- overhead + * 1. Apart from operations on a delayed extent tree, we need to + * down_write(inode->i_data_sem) in delayed write path to maintain delayed + * extent tree, this can have impact on parallel read-write and write-write + * + * 2. There is a cache extent for write access, so if writes are not very + * random, adding space operaions are in O(1) time. + * + * -- gain + * 3. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * 4. TODO + * -- cache_de in de_find_extent + * de_find_extent is used heavily by bgalloc, so de_find_extent should use + * cache_de. + * + */ +static struct kmem_cache *ext4_de_cachep; + +int __init ext4_init_de(void) +{ + ext4_de_cachep = KMEM_CACHE(delayed_extent, SLAB_RECLAIM_ACCOUNT); + if (ext4_de_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_de(void) +{ + if (ext4_de_cachep) + kmem_cache_destroy(ext4_de_cachep); +} + +void ext4_de_init_tree(struct ext4_de_tree *tree) +{ + tree->root = RB_ROOT; + tree->cache_de = NULL; +} + +#ifdef DE_DEBUG +static void ext4_de_print_tree(struct inode *inode) +{ + struct ext4_de_tree *tree; + struct rb_node *node; + + printk(KERN_DEBUG "delayed extents for inode %lu:", inode->i_ino); + tree = &EXT4_I(inode)->i_de_tree; + node = rb_first(&tree->root); + while(node) { + struct delayed_extent *de; + de = rb_entry(node, struct delayed_extent, rb_node); + printk(KERN_DEBUG " [%u/%u)", de->start, de->len); + node = rb_next(node); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_de_print_tree(inode) +#endif + +static inline ext4_lblk_t delayed_extent_end(struct delayed_extent *de) +{ + if (de->start + de->len < de->start) + return (ext4_lblk_t)-1; + return de->start + de->len; +} + +/* + * search through the tree for an delayed_extent with a given offset. If + * it can't be found, try to find next extent. + */ +static struct delayed_extent * __de_tree_search(struct rb_root *root, + ext4_lblk_t offset) +{ + struct rb_node *node = root->rb_node; + struct delayed_extent *de = NULL; + + while (node) { + de = rb_entry(node, struct delayed_extent, rb_node); + if (offset < de->start) + node = node->rb_left; + else if (offset >= delayed_extent_end(de)) + node = node->rb_right; + else + return de; + } + + if (de && offset < de->start) + return de; + + if (de && offset >= delayed_extent_end(de)) + return rb_entry(rb_next(&de->rb_node), + struct delayed_extent, rb_node); + + return NULL; +} + +/* + * ext4_de_first_extent_since: find the 1st delayed extent covering @start + * if it exists, otherwise, the next extent after @start. + * + * @inode: the inode which owns delayed extents + * @offset: logic block + * + * Returns next block beyond the found extent. + * Delayed extent is returned via @de. + */ +ext4_lblk_t ext4_de_find_extent(struct inode *inode, struct delayed_extent *de) +{ + struct ext4_de_tree *tree; + struct delayed_extent *de1; + struct rb_node *node; + + de->len = 0; + tree = &EXT4_I(inode)->i_de_tree; + de1 = __de_tree_search(&tree->root, de->start); + if (de1) { + de->start = de1->start; + de->len = de1->len; + node = rb_next(&de1->rb_node); + if (node) { + de1 = rb_entry(node, struct delayed_extent, rb_node); + return de1->start; + } + } + + return EXT_MAX_BLOCKS; +} + +static struct delayed_extent * +ext4_de_alloc_extent(ext4_lblk_t start, ext4_lblk_t len) +{ + struct delayed_extent *de; + de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS); + if (de == NULL) + return NULL; + de->start = start; + de->len = len; + return de; +} + +static void ext4_de_free_extent(struct delayed_extent *de) +{ + kmem_cache_free(ext4_de_cachep, de); +} + +static void ext4_de_try_to_merge_left(struct ext4_de_tree *tree, + struct delayed_extent *de) +{ + struct delayed_extent *de1; + struct rb_node *node; + + node = rb_prev(&de->rb_node); + if (!node) + return; + + de1 = rb_entry(node, struct delayed_extent, rb_node); + if (delayed_extent_end(de1) == de->start) { + de1->len += de->len; + rb_erase(&de->rb_node, &tree->root); + if (de == tree->cache_de) + tree->cache_de = de1; + ext4_de_free_extent(de); + } +} + +static void ext4_de_try_to_merge_right(struct ext4_de_tree *tree, + struct delayed_extent *de) +{ + struct delayed_extent *de1; + struct rb_node *node; + + node = rb_next(&de->rb_node); + if (!node) + return; + + de1 = rb_entry(node, struct delayed_extent, rb_node); + if (de1->start == delayed_extent_end(de)) { + de->len += de1->len; + rb_erase(node, &tree->root); + if (de1 == tree->cache_de) + tree->cache_de = de; + ext4_de_free_extent(de1); + } +} + +/* + * ext4_de_add_space adds a space to a delayed extent tree. + * Caller holds inode->i_data_sem. + * + * ext4_de_add_space is callyed by ext4_dealyed_write_begin and + * ext4_de_remove_space. + * + * Return 0 on success, error code on failure. + */ +int ext4_de_add_space(struct inode *inode, ext4_lblk_t offset, ext4_lblk_t len) +{ + struct ext4_de_tree *tree = &EXT4_I(inode)->i_de_tree; + struct rb_node **p = &tree->root.rb_node; + struct rb_node *parent = NULL; + struct delayed_extent *de; + ext4_lblk_t end = offset + len; + + BUG_ON(end <= offset); + + de = tree->cache_de; + de_debug("add [%u/%u) to delayed extent list of inode %lu\n", + offset, len, inode->i_ino); + + if (de && delayed_extent_end(de) == offset) { + de->len += len; + ext4_de_try_to_merge_right(tree, de); + goto out; + } else if (de && de->start == end) { + de->start = offset; + de->len += len; + ext4_de_try_to_merge_left(tree, de); + goto out; + } else if (de && de->start <= offset && + delayed_extent_end(de) >= end) + goto out; + + while (*p) { + parent = *p; + de = rb_entry(parent, struct delayed_extent, rb_node); + + if (offset < de->start) { + if (end == de->start) { + de->len += len; + de->start = offset; + goto out; + } + p = &(*p)->rb_left; + } else if (offset > delayed_extent_end(de)) { + if (delayed_extent_end(de) == offset) { + de->len += len; + goto out; + } + p = &(*p)->rb_right; + } else + goto out; + } + + de = ext4_de_alloc_extent(offset, len); + if (!de) + return -ENOMEM; + rb_link_node(&de->rb_node, parent, p); + rb_insert_color(&de->rb_node, &tree->root); + +out: + tree->cache_de = de; + ext4_de_print_tree(inode); + + return 0; +} + +/* + * ext4_de_remove_space() removes a space from a delayed extent tree. + * Caller holds inode->i_data_sem. + * + * Return 0 on success, error code on failure. + */ +int ext4_de_remove_space(struct inode *inode, ext4_lblk_t offset, + ext4_lblk_t len) +{ + struct rb_node *node; + struct ext4_de_tree *tree; + struct delayed_extent *de; + struct delayed_extent orig_de; + ext4_lblk_t len1, len2, end; + + de_debug("remove [%u/%u) from delayed extent list of inode %lu\n", + offset, len, inode->i_ino); + + end = offset + len; + BUG_ON(end <= offset); + tree = &EXT4_I(inode)->i_de_tree; + de = __de_tree_search(&tree->root, offset); + if (!de) + goto out; + + orig_de.start = de->start; + orig_de.len = de->len; + len1 = offset > de->start ? offset - de->start : 0; + len2 = delayed_extent_end(de) > end ? + delayed_extent_end(de) - end : 0; + if (len1 > 0) + de->len = len1; + if (len2 > 0) { + if (len1 > 0) { + int err; + err = ext4_de_add_space(inode, end, len2); + if (err) { + de->start = orig_de.start; + de->len = orig_de.len; + return err; + } + } else { + de->start = end; + de->len = len2; + } + goto out; + } + + if (len1 > 0) { + node = rb_next(&de->rb_node); + if (!node) + de = rb_entry(node, struct delayed_extent, rb_node); + else + de = NULL; + } + + while (de && delayed_extent_end(de) <= end) { + node = rb_next(&de->rb_node); + rb_erase(&de->rb_node, &tree->root); + if (de == tree->cache_de) + tree->cache_de = NULL; + ext4_de_free_extent(de); + if (!node) { + de = NULL; + break; + } + de = rb_entry(node, struct delayed_extent, rb_node); + } + + if (de && de->start < end) { + len1 = delayed_extent_end(de) - end; + de->start = end; + de->len = len1; + } + +out: + ext4_de_print_tree(inode); + return 0; +} diff --git a/fs/ext4/delayed_extents.h b/fs/ext4/delayed_extents.h index 66d6390..3f649d9 100644 --- a/fs/ext4/delayed_extents.h +++ b/fs/ext4/delayed_extents.h @@ -8,6 +8,13 @@ #ifndef _EXT4_DELAYED_EXTENTS_H #define _EXT4_DELAYED_EXTENTS_H +#define DE_DEBUG +#ifdef DE_DEBUG +#define de_debug(a...) printk(a) +#else +#define de_debug(a...) +#endif + struct delayed_extent { struct rb_node rb_node; ext4_lblk_t start; /* first block extent covers */ @@ -19,4 +26,15 @@ struct ext4_de_tree { struct delayed_extent *cache_de; /* recently accessed extent */ }; +extern int __init ext4_init_de(void); +extern void ext4_exit_de(void); +extern void ext4_de_init_tree(struct ext4_de_tree *tree); + +extern int ext4_de_add_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern int ext4_de_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern ext4_lblk_t ext4_de_find_extent(struct inode *inode, + struct delayed_extent *de); + #endif /* _EXT4_DELAYED_EXTENTS_H */ -- 1.7.5.1 -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html