This patch adds operations on a delayed extent tree. Signed-off-by; Yongqiang Yang <xiaoqiangnk@xxxxxxxxx> Signed-off-by: Yongqiang Yang <xiaoqiangnk@xxxxxxxxx> --- fs/ext4/Makefile | 2 +- fs/ext4/delayed_extents.c | 390 +++++++++++++++++++++++++++++++++++++++++++++ fs/ext4/delayed_extents.h | 25 +++ 3 files changed, 416 insertions(+), 1 deletions(-) create mode 100644 fs/ext4/delayed_extents.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 56fd8f86..ee16ad3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -7,7 +7,7 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o + mmp.o indirect.o delayed_extents.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/delayed_extents.c b/fs/ext4/delayed_extents.c new file mode 100644 index 0000000..6275b19 --- /dev/null +++ b/fs/ext4/delayed_extents.c @@ -0,0 +1,390 @@ +/* + * fs/ext4/delayed_extents.c + * + * Written by Yongqiang Yang <xiaoqiangnk@xxxxxxxxx> + * + * Ext4 delayed-extents-list core functions. + */ + +#include "ext4.h" +#include "delayed_extents.h" +#include "ext4_extents.h" + +/* + * delayed extent implementation for ext4. + * + * + * ========================================================================== + * 1. Why delayed extent implementation ? + * + * Without delayed extent, ext4 identifies a delayed extent by looking up + * page cache, this has several deficiencies - complicated, buggy, and + * inefficient code. + * + * FIEMAP, SEEK_HOLE/DATA, bigalloc, punch hole and writeout all need to know if + * a block or a range of blocks are belonged to a delayed extent. + * + * Let us have a look at how they do without delayed extents implementation. + * -- FIEMAP + * FIEMAP looks up page cache to identify delayed allocations from holes. + * + * -- SEEK_HOLE/DATA + * SEEK_HOLE/DATA has the same problem as FIEMAP. + * + * -- bigalloc + * bigalloc looks up page cache to figure out if a block is already + * under delayed allocation or not to determine whether quota reserving + * is needed for the cluster. + * + * -- punch hole + * punch hole looks up page cache to identify a delayed extent. + * + * -- writeout + * Writeout looks up whole page cache to see if a buffer is mapped, If + * there are not very many delayed buffers, then it is time comsuming. + * + * With delayed extents implementation, FIEMAP, SEEK_HOLE/DATA, bigalloc and + * writeout can figure out if a block or a range of blocks is under delayed + * allocation(belonged to a delayed extent) or not by searching the delayed + * extent list. + * + * + * ========================================================================== + * 2. ext4 delayed extents impelmentation + * + * -- delayed extent + * A delayed extent is a range of blocks which are contiguous logically and + * under delayed allocation. Unlike extent in ext4, delayed extent in ext4 + * is a in-memory struct, there is no corresponding on-disk data. There is + * no limit on length of delayed extent, so a delayed extent can contain as + * many blocks as they are contiguous logically. + * + * -- delayed extent list + * Every inode has a delayed extent list and all under delayed allocation + * blocks are added to the list as dealyed extents. Delayed extents in + * the list are sorted by logical block no. + * + * -- operations on a delayed extent list + * There are three operations on a delayed extent list: find next delayed + * adding a space(a range of blocks) to a list and removing a space from + * a list. + * + * -- race on a delayed extent list + * Delayed extent list is protected inode->i_data_sem like extent tree. + * + * + * ========================================================================== + * 3. performance analysis + * -- overhead + * 1. Apart from operations on a delayed extent list, we need to + * down_write(inode->i_data_sem) in delayed write path to maintain delayed + * extent list, this can have impact on parallel read-write and write-write + * + * 2. There is a cache extent for write access, so if writes are not very + * random, adding space operaions are in O(1) time. + * + * -- gain + * 3. Code is much simpler, more readable, more maintainable and + * more efficient. + * + * 4. TODO + * -- cache_de in de_find_extent + * de_find_extent is used heavily by bgalloc, so de_find_extent should use + * cache_de. + * + * ========================================================================== + * 4. More sophiscated data structure like tree ? + * Does we need rb tree instead? + * + */ +static struct kmem_cache *ext4_de_cachep; + +int __init ext4_init_de(void) +{ + ext4_de_cachep = KMEM_CACHE(ext4_delayed_extent, + SLAB_RECLAIM_ACCOUNT); + if (ext4_de_cachep == NULL) + return -ENOMEM; + return 0; +} + +void ext4_exit_de(void) +{ + kmem_cache_destroy(ext4_de_cachep); +} + +#ifdef DE_DEBUG +void ext4_de_print_list(struct inode *inode) +{ + struct ext4_delayed_info *delayed_info; + struct list_head *cur; + + printk(KERN_DEBUG "delayed extents for inode %lu:", inode->i_ino); + delayed_info = &EXT4_I(inode)->i_delayed_info; + list_for_each(cur, &delayed_info->de_list) { + struct ext4_delayed_extent *de; + de = list_entry(cur, struct ext4_delayed_extent, de_list); + printk(KERN_DEBUG " [%u, %u)", de->de_start, de->de_end); + } + printk(KERN_DEBUG "\n"); +} +#else +#define ext4_de_print_list(inode) +#endif + +#define list_tail(list, type, member) \ + list_entry((list)->prev, type, member) + +#define list_next(head, elm, member) \ + ((elm)->member.next != head ? \ + list_entry((elm)->member.next, typeof(*elm), member) : NULL) + +#define list_prev(head, elm, member) \ + ((elm)->member.prev != head ? \ + list_entry((elm)->member.prev, typeof(*elm), member) : NULL) + +/* + * ext4_de_first_extent_since: find the 1st delayed extent covering @start + * if it exists, otherwise, the next extent after @start. + * + * @inode: the inode which owns delayed extents + * @start: logic block + * + * Return next delayed block after the found extent. Delayed extent is returned + * the extent is returned via @de. + */ +ext4_lblk_t ext4_de_find_extent(struct inode *inode, + struct ext4_delayed_extent *de) +{ + struct ext4_delayed_info *delayed_info; + struct list_head *cur; + struct ext4_delayed_extent *de1; + + de->de_end = 0; + delayed_info = &EXT4_I(inode)->i_delayed_info; + if (list_empty(&delayed_info->de_list)) + return EXT_MAX_BLOCKS; + + de1 = list_tail(&delayed_info->de_list, struct ext4_delayed_extent, + de_list); + if (de->de_start >= de1->de_end) + return EXT_MAX_BLOCKS; + + list_for_each(cur, &delayed_info->de_list) { + de1 = list_entry(cur, struct ext4_delayed_extent, de_list); + if (de->de_end != 0) + return de1->de_start; + + if (de1->de_end <= de->de_start) + /* @de is followed by @*start and they are + * not contiguous. + * |------------|.........| + * de *start + */ + continue; + de->de_start = de1->de_start; + de->de_end = de1->de_end; + } + + BUG_ON(de->de_end == 0); + return EXT_MAX_BLOCKS; +} + +static struct ext4_delayed_extent * +ext4_de_add_entry(struct list_head *head, ext4_lblk_t start, ext4_lblk_t end) +{ + struct ext4_delayed_extent *de; + de = kmem_cache_alloc(ext4_de_cachep, GFP_NOFS); + if (de == NULL) + return NULL; + de->de_start = start; + de->de_end = end; + INIT_LIST_HEAD(&de->de_list); + list_add(&de->de_list, head); + return de; +} + +static void ext4_de_remove_entry(struct inode *inode, + struct ext4_delayed_extent *de) +{ + struct ext4_delayed_info *delayed_info = &EXT4_I(inode)->i_delayed_info; + + list_del(&de->de_list); + kmem_cache_free(ext4_de_cachep, de); + if (delayed_info->cache_de == de) + delayed_info->cache_de = NULL; +} + +static void ext4_de_try_to_merge_left(struct inode *inode, + struct ext4_delayed_extent *de) +{ + struct ext4_delayed_extent *de1; + + de1 = list_prev(&EXT4_I(inode)->i_delayed_info.de_list, de, de_list); + if (de1 && de1->de_end == de->de_start) { + de->de_start = de1->de_start; + ext4_de_remove_entry(inode, de1); + } +} + +static void ext4_de_try_to_merge_right(struct inode *inode, + struct ext4_delayed_extent *de) +{ + struct ext4_delayed_extent *de1; + + de1 = list_next(&EXT4_I(inode)->i_delayed_info.de_list, de, de_list); + if (de1 && de1->de_start == de->de_end) { + de->de_end = de1->de_end; + ext4_de_remove_entry(inode, de1); + } +} + +/* + * ext4_de_add_space() adds a space to a delayed extent list. + * Caller hold inode->i_data_sem. + * + * Return 0 on success, error code on failure. + */ +int ext4_de_add_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t len) +{ + ext4_lblk_t end = start + len; + struct ext4_delayed_info *delayed_info; + struct ext4_delayed_extent *de; + struct list_head *cur; + + /* ext4_de_add_space is called with len = 1. */ + BUG_ON(len != 1); + delayed_info = &EXT4_I(inode)->i_delayed_info; + de = delayed_info->cache_de; + de_debug("add [%u, %u) to delayed extent list of inode %lu\n", + start, end, inode->i_ino); + + if (de && de->de_end == start) { + de->de_end = end; + ext4_de_try_to_merge_right(inode, de); + goto out; + } else if (de && de->de_start == end) { + de->de_start = start; + ext4_de_try_to_merge_left(inode, de); + goto out; + } else if (de && de->de_start <= start && de->de_end >= end) + goto out; + + /* We traverse the list backwards. */ + list_for_each_prev(cur, &delayed_info->de_list) { + de = list_entry(cur, struct ext4_delayed_extent, de_list); + if (de->de_start > end) + /* de folllows [start, end) and they are + * not contiguous. + * |------------|----|-----------| + * [start, end) de + */ + continue; + + if (de->de_end == start) { + /* de is followed by [start, end) and they are + * contiguous. + * |-----------|---------------| + * de [start, end) + */ + de->de_end = end; + goto out; + } else if (de->de_start == end) { + /* de follows [start, end) and they are contiguous. + * |-------------|------------| + * [start, end) de + */ + de->de_start = start; + ext4_de_try_to_merge_left(inode, de); + goto out; + } else if (start >= de->de_start && end <= de->de_end) + goto out; + + break; + } + + de = ext4_de_add_entry(cur, start, end); + if (!de) + return -ENOMEM; +out: + delayed_info->cache_de = de; + ext4_de_print_list(inode); + + return 0; +} + +/* + * ext4_de_remove_space() removes a space from a delayed extent list. + * Caller hold inode->i_data_sem. + * + * Return 0 on success, error code on failure. + */ +int ext4_de_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len) +{ + ext4_lblk_t end = start + len; + struct ext4_delayed_info *delayed_info; + struct list_head *cur; + + delayed_info = &EXT4_I(inode)->i_delayed_info; + + de_debug("remove [%u, %u) from delayed extent list of inode %lu\n", + start, end, inode->i_ino); + + list_for_each(cur, &delayed_info->de_list) { + struct ext4_delayed_extent *de, *new_de; + new_de = NULL; + de = list_entry(cur, struct ext4_delayed_extent, de_list); + if (de->de_end <= start) + /* de resides before [start, end) + * |---------|...|-----------| + * de [start, end) + */ + continue; + + if (de->de_start >= end) + /* de resides after [start, end) + * |---------|...|-----------| + * [start,end) de + */ + break; + + /* de and [start, end) intersect */ + if (de->de_start < start && de->de_end <= end) { + /* + * |-----------------|--------|.........| + * de->de_start start de->de_end end + */ + de->de_end = start; + start = de->de_end; + } else if ((de->de_start >= start && de->de_end <= end)) { + /* + * |............|--------|.........| + * start de->start de->end end + */ + start = de->de_end; + cur = de->de_list.prev; + ext4_de_remove_entry(inode, de); + } else if (de->de_start < start && de->de_end > end) { + /* + * |------------|--------|---------| + * de->start start end de->de_end + */ + if (!ext4_de_add_entry(&de->de_list, end, de->de_end)) + return -ENOMEM; + de->de_end = start; + break; + } else if (de->de_start >= start && de->de_end > end) { + /* + * |.............|---------|--------| + * start de->de_start end de->de_end + */ + de->de_start = end; + break; + } else + BUG(); + } + + ext4_de_print_list(inode); + return 0; +} diff --git a/fs/ext4/delayed_extents.h b/fs/ext4/delayed_extents.h index b7baaef..dd3bdb5 100644 --- a/fs/ext4/delayed_extents.h +++ b/fs/ext4/delayed_extents.h @@ -8,6 +8,21 @@ #ifndef _EXT4_DELAYED_EXTENTS_H #define _EXT4_DELAYED_EXTENTS_H +/* + * Turn on DE_DEBUG to get lots of info about delayed extents operations. + */ +#define DE_DEBUG__ +#ifdef DE_DEBUG +#define de_debug(f, a...) \ + do { \ + printk(KERN_DEBUG "EXT4-fs delayed-extents (%s, %d): " \ + "%s:", __FILE__, __LINE__, __func__); \ + printk(KERN_DEBUG f, ## a); \ + } while (0) +#else +#define de_debug(f, a...) +#endif + struct ext4_delayed_extent { struct list_head de_list; ext4_lblk_t de_start; /* first block extent covers */ @@ -19,4 +34,14 @@ struct ext4_delayed_info { struct ext4_delayed_extent *cache_de; /* recently accessed extent */ }; +extern int __init ext4_init_de(void); +extern void ext4_exit_de(void); + +extern int ext4_de_add_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern int ext4_de_remove_space(struct inode *inode, ext4_lblk_t start, + ext4_lblk_t len); +extern ext4_lblk_t ext4_de_find_extent(struct inode *inode, + struct ext4_delayed_extent *de); + #endif /* _EXT4_DELAYED_EXTENTS_H */ -- 1.7.5.1 -- To unsubscribe from this list: send the line "unsubscribe linux-ext4" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html