From: Yuexin Li <lyx1209@xxxxxxxxx> This patch adds a new freespace tree allocator. The allocator organizes free space regions in two in-memory rbtrees per flex_bg, one sorted by length and the other by offset. We use these per-flex-bg trees to quickly scan length-sorted free space regions and determine the best region for a given request. This feature can be enabled by passing the "-o freespace_tree" mount option. Signed-off-by: Yuexin Li <lyx1209@xxxxxxxxx> Co-developed-by: Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx> Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx> --- fs/ext4/ext4.h | 45 +++ fs/ext4/mballoc.c | 984 ++++++++++++++++++++++++++++++++++++++++++++-- fs/ext4/mballoc.h | 61 ++- fs/ext4/resize.c | 8 + fs/ext4/super.c | 35 +- 5 files changed, 1084 insertions(+), 49 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4df6f429de1a..3bb2675d4d40 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -363,6 +363,7 @@ struct flex_groups { atomic64_t free_clusters; atomic_t free_inodes; atomic_t used_dirs; + struct ext4_frsp_tree *frsp_tree; }; #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ @@ -1197,6 +1198,12 @@ struct ext4_inode_info { #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly specified journal checksum */ + +#define EXT4_MOUNT2_FREESPACE_TREE 0x00000020 /* Enable freespace tree + * allocator (turns buddy + * allocator off) + */ + #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ @@ -1402,6 +1409,30 @@ struct ext4_super_block { #define ext4_has_strict_mode(sbi) \ (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) +/* + * Freespace tree structure + */ +struct ext4_frsp_tree { + struct rb_root_cached frsp_offset_root; /* Root for offset + * sorted tree + */ + struct rb_root_cached frsp_len_root; /* Root for length + * sorted tree + */ + struct mutex frsp_lock; + __u8 frsp_flags; + __u32 frsp_max_free_len; /* Max free extent in + * this flex bg + */ + __u32 frsp_index; /* Tree index (flex bg + * number) + */ +}; + +/* Freespace tree flags */ + +/* Tree is loaded in memory */ +#define EXT4_MB_FRSP_FLAG_LOADED 0x0001 /* * fourth extended-fs super-block data in memory */ @@ -1539,6 +1570,9 @@ struct ext4_sb_info { struct flex_groups * __rcu *s_flex_groups; ext4_group_t s_flex_groups_allocated; + /* freespace trees stuff */ + int s_mb_num_frsp_trees; + /* workqueue for reserved extent conversions (buffered io) */ struct workqueue_struct *rsv_conversion_wq; @@ -2653,6 +2687,7 @@ extern int ext4_init_inode_table(struct super_block *sb, extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); /* mballoc.c */ +extern int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups); extern const struct seq_operations ext4_mb_seq_groups_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; @@ -3084,6 +3119,16 @@ static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, return block_group >> sbi->s_log_groups_per_flex; } +static inline struct ext4_frsp_tree * +ext4_get_frsp_tree(struct super_block *sb, ext4_group_t flex_bg) +{ + struct flex_groups *fg; + + fg = sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, flex_bg); + + return fg->frsp_tree; +} + static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) { return 1 << sbi->s_log_groups_per_flex; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 2d8d3d9c7918..74bdc2dcb49c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@
-333,6 +333,7 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; static struct kmem_cache *ext4_free_data_cachep; +static struct kmem_cache *ext4_freespace_node_cachep; /* We create slab caches for groupinfo data structures based on the * superblock block size. There will be one per mounted filesystem for @@ -352,6 +353,11 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b, int flags); +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex); +static void ext4_mb_unload_allocator(struct ext4_buddy *e4b); + /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block @@ -395,6 +401,33 @@ static inline void *mb_correct_addr_and_bit(int *bit, void *addr) return addr; } +static inline int ext4_blkno_to_flex_offset(struct super_block *sb, + ext4_fsblk_t blkno) +{ + return blkno % (ext4_flex_bg_size(EXT4_SB(sb)) * + EXT4_SB(sb)->s_blocks_per_group); +} + +static inline ext4_fsblk_t ext4_flex_offset_to_blkno(struct super_block *sb, + int flex_bg, int flex_offset) +{ + return flex_bg * ext4_flex_bg_size(EXT4_SB(sb)) * + EXT4_SB(sb)->s_blocks_per_group + flex_offset; +} + +static inline int ext4_mb_frsp_on(struct super_block *sb) +{ + return test_opt2(sb, FREESPACE_TREE) && + EXT4_SB(sb)->s_es->s_log_groups_per_flex; +} + +static inline unsigned int ext4_num_grps_to_flexbg(struct super_block *sb, + int ngroups) +{ + return (ngroups + ext4_flex_bg_size(EXT4_SB(sb)) - 1) >> + (EXT4_SB(sb)->s_es->s_log_groups_per_flex); +} + static inline int mb_test_bit(int bit, void *addr) { /* @@ -453,6 +486,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) { char *bb; + WARN_ON(ext4_mb_frsp_on(e4b->bd_sb)); BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); BUG_ON(max == NULL); @@ -620,6 +654,9 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy; void *buddy2; + if (ext4_mb_frsp_on(sb)) + return 0; + { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) @@ -706,6 +743,729 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, #define mb_check_buddy(e4b) #endif +/* Generic macro for inserting a new node in cached rbtree */ +#define ext4_mb_rb_insert(__root, __new, __node_t, __entry, __cmp) do { \ + struct rb_node **iter = &((__root)->rb_root.rb_node), *parent = NULL; \ + bool leftmost = true; \ + __node_t *this = NULL; \ + \ + while (*iter) { \ + this = rb_entry(*iter, __node_t, __entry); \ + parent = *iter; \ + if (__cmp((__new), this)) { \ + iter = &((*iter)->rb_left); \ + } else { \ + iter = &((*iter)->rb_right); \ + leftmost = false; \ + } \ + } \ + rb_link_node(&(__new)->__entry, parent, iter); \ + rb_insert_color_cached(&(__new)->__entry, __root, leftmost); \ +} while (0) + +static inline int ext4_mb_frsp_offset_cmp(struct ext4_frsp_node *arg1, + struct ext4_frsp_node *arg2) +{ + return (arg1->frsp_offset < arg2->frsp_offset); +} + +/* + * Free space extents length sorting function, the nodes are sorted in + * decreasing order of length + */ +static inline int ext4_mb_frsp_len_cmp(struct ext4_frsp_node *arg1, + struct ext4_frsp_node *arg2) +{ + return (arg1->frsp_len > arg2->frsp_len); +} + +/* insert to offset-indexed tree */ +static void ext4_mb_frsp_insert_off(struct 
ext4_frsp_tree *tree, + struct ext4_frsp_node *new_entry) +{ + memset(&new_entry->frsp_node, 0, sizeof(new_entry->frsp_node)); + ext4_mb_rb_insert(&tree->frsp_offset_root, new_entry, + struct ext4_frsp_node, frsp_node, ext4_mb_frsp_offset_cmp); +} + +/* insert to tree sorted by length */ +static void ext4_mb_frsp_insert_len(struct ext4_frsp_tree *tree, + struct ext4_frsp_node *new_entry) +{ + memset(&new_entry->frsp_len_node, 0, sizeof(new_entry->frsp_len_node)); + ext4_mb_rb_insert(&tree->frsp_len_root, new_entry, + struct ext4_frsp_node, frsp_len_node, + ext4_mb_frsp_len_cmp); +} + +#ifdef CONFIG_EXT4_DEBUG +/* print the length-sorted freespace tree in decreasing order of length */ +void ext4_mb_frsp_print_tree_len(struct super_block *sb, + struct ext4_frsp_tree *tree) +{ + unsigned int count = 0; + ext4_fsblk_t blk = 0, blk_end = 0; + ext4_group_t group = 0, group_end = 0; + struct ext4_frsp_node *entry = NULL; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct rb_node *cur; + + cur = rb_first_cached(&tree->frsp_len_root); + mb_debug(sb, "\nTree\tNode\tlength\tblock\tgroup\n"); + + while (cur) { + entry = rb_entry(cur, struct ext4_frsp_node, frsp_len_node); + count++; + blk = ext4_flex_offset_to_blkno(sb, tree->frsp_index, + entry->frsp_offset); + blk_end = ext4_flex_offset_to_blkno(sb, tree->frsp_index, + entry->frsp_offset + entry->frsp_len - 1); + group = blk / sbi->s_blocks_per_group; + group_end = blk_end / sbi->s_blocks_per_group; + mb_debug(sb, "%u\t%u\t%u\t%u(%llu)--%llu\t%u--%u\n", + tree->frsp_index, count, entry->frsp_len, + entry->frsp_offset, blk, blk_end, group, group_end); + cur = rb_next(cur); + } +} +#endif + +static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb) +{ + struct ext4_frsp_node *node; + + node = kmem_cache_alloc(ext4_freespace_node_cachep, GFP_NOFS); + if (!node) + return NULL; + + RB_CLEAR_NODE(&node->frsp_node); + RB_CLEAR_NODE(&node->frsp_len_node); + + return node; +} + +static void ext4_mb_frsp_free_node(struct super_block *sb, + struct ext4_frsp_node *node) +{ + kmem_cache_free(ext4_freespace_node_cachep, node); +} + +/* Evict a tree from memory */ +void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree) +{ + struct ext4_frsp_node *frsp_node; + struct rb_node *node; + + mb_debug(sb, "Evicting tree %d\n", tree->frsp_index); + mutex_lock(&tree->frsp_lock); + if (!(tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED)) + goto out; + + node = rb_first_cached(&tree->frsp_offset_root); + while (node) { + frsp_node = rb_entry(node, struct ext4_frsp_node, frsp_node); + rb_erase_cached(node, &tree->frsp_offset_root); + rb_erase_cached(&frsp_node->frsp_len_node, + &tree->frsp_len_root); + ext4_mb_frsp_free_node(sb, frsp_node); + node = rb_first_cached(&tree->frsp_offset_root); + } + tree->frsp_flags &= ~EXT4_MB_FRSP_FLAG_LOADED; + tree->frsp_offset_root = RB_ROOT_CACHED; + tree->frsp_len_root = RB_ROOT_CACHED; +out: + mutex_unlock(&tree->frsp_lock); +} + +/* + * Search tree by flexbg offset. Returns the node containing "target". If + * no such node is present, returns NULL. Must be called with tree mutex held.
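+ * + * For example, if the tree holds free extents {offset = 0, len = 100} and + * {offset = 200, len = 50}, a lookup of target 230 returns the second + * node, while a lookup of target 150 falls in the gap between the two + * extents and returns NULL.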
+ */ +struct ext4_frsp_node *ext4_mb_frsp_search_by_off(struct super_block *sb, + struct ext4_frsp_tree *tree, + ext4_grpblk_t target) +{ + struct rb_root *root = &tree->frsp_offset_root.rb_root; + struct rb_node *node = root->rb_node; + struct ext4_frsp_node *this = NULL; + + while (node) { + this = rb_entry(node, struct ext4_frsp_node, frsp_node); + if (target < this->frsp_offset) + node = node->rb_left; + else if (target >= this->frsp_offset + this->frsp_len) + node = node->rb_right; + else + return this; + } + return NULL; +} + +/* + * Check if two entries in freespace tree can be merged together. Nodes + * can merge together only if they are physically contiguous and belong + * to the same block group. + */ +int ext4_mb_frsp_node_can_merge(struct super_block *sb, + struct ext4_frsp_node *prev_entry, struct ext4_frsp_node *cur_entry) +{ + if (prev_entry->frsp_offset + prev_entry->frsp_len != + cur_entry->frsp_offset) + return 0; + if (prev_entry->frsp_offset / EXT4_SB(sb)->s_blocks_per_group != + cur_entry->frsp_offset / EXT4_SB(sb)->s_blocks_per_group) + return 0; + return 1; +} + +/* + * Add a new free space region to the tree. We insert the new node into both + * the length-sorted and the flex-bg-offset-sorted trees. Must be called with + * the tree mutex held. + */ +int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree, + ext4_grpblk_t offset, ext4_grpblk_t len) +{ + struct ext4_frsp_node *new_entry = NULL, *next_entry = NULL, + *prev_entry = NULL; + struct rb_node *left = NULL, *right = NULL; + + new_entry = ext4_mb_frsp_alloc_node(sb); + if (!new_entry) + return -ENOMEM; + + new_entry->frsp_offset = offset; + new_entry->frsp_len = len; + + ext4_mb_frsp_insert_off(tree, new_entry); + /* try merge to left and right */ + /* left */ + left = rb_prev(&new_entry->frsp_node); + if (left) { + prev_entry = rb_entry(left, struct ext4_frsp_node, frsp_node); + if (ext4_mb_frsp_node_can_merge(sb, prev_entry, new_entry)) { + new_entry->frsp_offset = prev_entry->frsp_offset; + new_entry->frsp_len += prev_entry->frsp_len; + rb_erase_cached(left, &tree->frsp_offset_root); + rb_erase_cached(&prev_entry->frsp_len_node, + &tree->frsp_len_root); + ext4_mb_frsp_free_node(sb, prev_entry); + } + } + + /* right */ + right = rb_next(&new_entry->frsp_node); + if (right) { + next_entry = rb_entry(right, struct ext4_frsp_node, frsp_node); + if (ext4_mb_frsp_node_can_merge(sb, new_entry, next_entry)) { + new_entry->frsp_len += next_entry->frsp_len; + rb_erase_cached(right, &tree->frsp_offset_root); + rb_erase_cached(&next_entry->frsp_len_node, + &tree->frsp_len_root); + ext4_mb_frsp_free_node(sb, next_entry); + } + } + ext4_mb_frsp_insert_len(tree, new_entry); + + return 0; +} + +/* + * Return @length free blocks starting at block number @block to the free + * space tree. + */ +int ext4_mb_frsp_free_blocks(struct super_block *sb, ext4_group_t group, + ext4_grpblk_t block, ext4_grpblk_t length) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_frsp_tree *tree = + ext4_get_frsp_tree(sb, ext4_flex_group(sbi, group)); + int err = 0; + + mutex_lock(&tree->frsp_lock); + err = ext4_mb_frsp_add_region(sb, tree, + ext4_blkno_to_flex_offset(sb, block), length); + mutex_unlock(&tree->frsp_lock); + + return err; +} + +/* + * Create freespace tree from on-disk bitmap. Must be called with tree mutex + * held. Returns 0 on success, error otherwise.
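+ * + * For example, a group whose bitmap reads 1 0 0 1 1 1 0 0 (bit 0 first, + * eight blocks per group for illustration) contributes two nodes to the + * tree: a 2-block extent at the group's flex offset + 1 and a 2-block + * extent at the group's flex offset + 6.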
+ */ +int ext4_mb_frsp_bb_to_tree(struct ext4_buddy *e4b, ext4_group_t group, + struct buffer_head *bh) +{ + struct super_block *sb = e4b->bd_sb; + int length = 0, bit = 0, next; + int end = EXT4_SB(sb)->s_blocks_per_group; + ext4_fsblk_t block; + int ret = 0; + + /* find all free ranges in the bitmap, convert each into a tree node */ + while (bit < end) { + bit = mb_find_next_zero_bit(bh->b_data, end, bit); + if (bit >= end) + break; + + next = mb_find_next_bit(bh->b_data, end, bit); + length = next - bit; + block = ext4_group_first_block_no(sb, group) + bit; + + ret = ext4_mb_frsp_add_region(sb, e4b->frsp_tree, + ext4_blkno_to_flex_offset(sb, block), length); + if (ret) + break; + bit = next + 1; + } + + return ret; +} + +/* + * Load one freespace_tree from disk. Assumes the tree mutex is held. + */ +int ext4_mb_frsp_load(struct ext4_buddy *e4b) +{ + ext4_group_t ngroups, group_start, group_end, grp; + struct ext4_sb_info *sbi = EXT4_SB(e4b->bd_sb); + int i, ret = 0; + struct buffer_head **bh = NULL; + + if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) + return 0; + + group_start = e4b->frsp_tree->frsp_index * ext4_flex_bg_size(sbi); + group_end = min(group_start + ext4_flex_bg_size(sbi), + ext4_get_groups_count(e4b->bd_sb)) - 1; + ngroups = group_end - group_start + 1; + + bh = kcalloc(ngroups, sizeof(*bh), GFP_KERNEL); + if (!bh) + return -ENOMEM; + for (i = 0; i < ngroups; i++) { + grp = i + group_start; + bh[i] = ext4_read_block_bitmap_nowait(e4b->bd_sb, grp, false); + if (IS_ERR(bh[i])) { + ret = PTR_ERR(bh[i]); + goto out; + } + } + for (i = 0; i < ngroups; i++) { + grp = i + group_start; + if (!bitmap_uptodate(bh[i])) { + ret = ext4_wait_block_bitmap(e4b->bd_sb, grp, bh[i]); + if (ret) + goto out; + } + ret = ext4_mb_frsp_bb_to_tree(e4b, grp, bh[i]); + if (ret) + goto out; + } + e4b->frsp_tree->frsp_flags |= EXT4_MB_FRSP_FLAG_LOADED; +out: + for (i = 0; i < ngroups; i++) { + if (!IS_ERR_OR_NULL(bh[i])) + put_bh(bh[i]); + if (!ret && IS_ERR(bh[i])) + ret = PTR_ERR(bh[i]); + } + kfree(bh); + return ret; +} + +/* + * Determine if node with length len is better than what we have found until + * now. Return 1 if that is the case, 0 otherwise. If len is an exact match, + * set *best = 1. + */ +static int ext4_mb_frsp_is_better(struct ext4_allocation_context *ac, + int len, int *best) +{ + struct ext4_tree_extent *btx = &ac->ac_b_tree_ex; + struct ext4_free_extent *gex = &ac->ac_g_ex; + + if (best) + *best = 0; + if (len == gex->fe_len) { + if (best) + *best = 1; + return 1; + } + if (ac->ac_criteria == 0 && len < gex->fe_len) + return 0; + /* + * See if the new node is closer to the goal length than + * the best extent found so far + */ + if (btx->te_len < gex->fe_len && len > btx->te_len) + return 1; + if (len > gex->fe_len && len < btx->te_len) + return 1; + return 0; +} + +/* + * Check whether we have scanned enough freespace candidates; + * stop scanning once s_mb_max_to_scan is reached/exceeded + */ +static void ext4_mb_frsp_check_limits(struct ext4_allocation_context *ac) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + + if (ac->ac_status == AC_STATUS_FOUND) + return; + + /* + * Exceeded max number of nodes to scan + */ + if (ac->ac_found > sbi->s_mb_max_to_scan && + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) + ac->ac_status = AC_STATUS_BREAK; +} + +/* + * Mark free space in selected tree node as used and update the tree. + * This must be called with the tree lock held.
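+ * + * The chosen (te_flex, te_flex_start) pair is translated back into + * (group, start) terms: e.g., assuming 16 groups per flex_bg and 32768 + * blocks per group, te_flex = 2 with te_flex_start = 40000 maps to group + * 2 * 16 + 40000 / 32768 = 33 and in-group start 40000 % 32768 = 7232.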
+ */ +static void ext4_mb_frsp_use_best_found(struct ext4_allocation_context *ac, + ext4_group_t flex, struct ext4_frsp_node *selected) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_tree_extent *btx = &ac->ac_b_tree_ex; + struct ext4_free_extent *bex; + unsigned int group_no; + struct ext4_buddy e4b; + + WARN_ON(ac->ac_status == AC_STATUS_FOUND); + btx->te_len = min(btx->te_len, ac->ac_g_ex.fe_len); + ac->ac_status = AC_STATUS_FOUND; + + /* update ac best-found-extent */ + bex = &ac->ac_b_ex; + group_no = (btx->te_flex * ext4_flex_bg_size(sbi)) + + (btx->te_flex_start / sbi->s_blocks_per_group); + bex->fe_start = btx->te_flex_start % sbi->s_blocks_per_group; + bex->fe_group = group_no; + bex->fe_len = btx->te_len; + bex->fe_node = selected; + + /* Remove the chosen blocks from the freespace tree */ + ext4_mb_load_allocator(ac->ac_sb, group_no, &e4b, + EXT4_ALLOCATOR_FRSP_NOLOAD); + mb_mark_used(&e4b, bex); + ext4_mb_unload_allocator(&e4b); +} + +/* + * The routine checks whether the found tree node is good enough. If it is, + * the tree node gets marked used and a flag is set in the context + * to stop scanning. + * + * Otherwise, the tree node is compared with the previously found extent + * and, if the new one is better, it's stored in the context. + * + * Later, the best found tree node will be used if mballoc can't find a + * good enough extent. + */ +static int ext4_mb_frsp_measure_node(struct ext4_allocation_context *ac, + int tree_idx, struct ext4_frsp_node *cur, + int goal) +{ + struct ext4_tree_extent *btx = &ac->ac_b_tree_ex; + ext4_fsblk_t start; + int best_found = 0, max; + + WARN_ON(btx->te_len < 0); + WARN_ON(ac->ac_status != AC_STATUS_CONTINUE); + + if (goal) { + start = ext4_group_first_block_no(ac->ac_sb, + ac->ac_g_ex.fe_group) + + ac->ac_g_ex.fe_start; + if (cur->frsp_offset > ext4_blkno_to_flex_offset(ac->ac_sb, + start)) + return 0; + max = cur->frsp_offset + cur->frsp_len - + ext4_blkno_to_flex_offset(ac->ac_sb, start); + if (max >= ac->ac_g_ex.fe_len && + ac->ac_g_ex.fe_len == EXT4_SB(ac->ac_sb)->s_stripe) { + ext4_fsblk_t stripe_off = start; + + /* do_div() modifies its dividend, so probe a copy */ + if (do_div(stripe_off, + EXT4_SB(ac->ac_sb)->s_stripe) != 0) + return 0; + best_found = 1; + } else if (max >= ac->ac_g_ex.fe_len) { + best_found = 1; + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { + /* + * Sometimes the caller may want to merge even a + * small number of blocks into an existing extent + */ + best_found = 1; + } else { + return 0; + } + ac->ac_found++; + goto use_extent; + } + ac->ac_found++; + + /* The special case - take what you catch first */ + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + best_found = 1; + goto use_extent; + } + + /* + * Prefer not allocating blocks in the first group of a flex bg for + * data blocks.
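+ * + * With flex_bg, the block bitmaps, inode bitmaps and inode tables of all + * member groups are typically packed into the first group of the flex + * group, so keep its blocks in reserve for metadata while the allocation + * criteria still allow it.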
+ */ + if (unlikely((ac->ac_criteria < 2) && + (ac->ac_flags & EXT4_MB_HINT_DATA) && + (cur->frsp_offset < EXT4_BLOCKS_PER_GROUP(ac->ac_sb)))) + return 1; + + /* If this is the first found extent, just store it in the context */ + if (btx->te_len == 0) + goto use_extent; + + if (ext4_mb_frsp_is_better(ac, cur->frsp_len, &best_found)) + goto use_extent; + + ext4_mb_frsp_check_limits(ac); + return 0; + +use_extent: + btx->te_flex = tree_idx; + if (goal) { + btx->te_flex_start = ext4_blkno_to_flex_offset(ac->ac_sb, + start); + btx->te_len = max; + } else { + btx->te_flex_start = cur->frsp_offset; + btx->te_len = cur->frsp_len; + } + if (best_found) + ext4_mb_frsp_use_best_found(ac, tree_idx, cur); + ext4_mb_frsp_check_limits(ac); + return 1; +} + +/* Search by goal */ +static noinline_for_stack +void ext4_mb_frsp_find_by_goal(struct ext4_allocation_context *ac) +{ + struct ext4_frsp_node *cur = NULL; + unsigned int tree_blk; + ext4_fsblk_t blk; + struct ext4_buddy e4b; + int ret; + + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) + return; + + /* compute start node offset in tree */ + blk = ext4_group_first_block_no(ac->ac_sb, ac->ac_g_ex.fe_group) + + ac->ac_g_ex.fe_start; + tree_blk = ext4_blkno_to_flex_offset(ac->ac_sb, blk); + + ret = ext4_mb_load_allocator(ac->ac_sb, ac->ac_g_ex.fe_group, &e4b, 0); + if (ret) + return; + + /* try goal block and its freespace_tree first */ + mutex_lock(&e4b.frsp_tree->frsp_lock); + cur = ext4_mb_frsp_search_by_off(ac->ac_sb, e4b.frsp_tree, tree_blk); + if (cur && tree_blk < cur->frsp_offset + cur->frsp_len - 1) + ext4_mb_frsp_measure_node(ac, e4b.frsp_tree->frsp_index, cur, + 1 /* searching by goal */); + + mutex_unlock(&e4b.frsp_tree->frsp_lock); + ext4_mb_unload_allocator(&e4b); +} + +static void ext4_mb_frsp_process(struct ext4_allocation_context *ac, + struct ext4_frsp_tree *tree) +{ + struct ext4_frsp_node *cur = NULL; + struct rb_node *node = NULL; + int ret; + + node = rb_first_cached(&tree->frsp_len_root); + mb_debug(ac->ac_sb, "Processing tree index %d, flags = %x\n", + tree->frsp_index, tree->frsp_flags); + /* + * In order to serve the "No data blocks in first group in a flexbg" + * requirement, we cannot do an O(log N) search here. TODO: it's + * possible to do that at CR=2, where that requirement doesn't apply.
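+ * + * Instead we walk the length-sorted tree from the largest extent down + * (rb_first_cached()/rb_next()) and, at CR=0, bail out as soon as the + * extents become smaller than the goal length. + */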
+ */ + while (node && ac->ac_status == AC_STATUS_CONTINUE) { + cur = rb_entry(node, struct ext4_frsp_node, frsp_len_node); + if (ac->ac_criteria == 0 && cur->frsp_len < ac->ac_g_ex.fe_len) + return; + ret = ext4_mb_frsp_measure_node(ac, tree->frsp_index, cur, + 0 /* searching by len */); + if (ret == 0) + return; + node = rb_next(node); + } +} + +/* allocator for freespace_tree */ +static noinline_for_stack int +ext4_mb_tree_allocator(struct ext4_allocation_context *ac) +{ + struct ext4_buddy e4b; + struct ext4_sb_info *sbi; + struct super_block *sb; + struct ext4_frsp_tree *tree = NULL; + struct ext4_frsp_node *cur = NULL; + struct ext4_tree_extent *btx = NULL; + int ret = 0, start_idx = 0, tree_idx, j; + + sb = ac->ac_sb; + btx = &ac->ac_b_tree_ex; + sbi = EXT4_SB(sb); + + start_idx = ext4_flex_group(sbi, ac->ac_g_ex.fe_group); + mb_debug(sb, "requested size %d\n", ac->ac_g_ex.fe_len); + /* First try searching from goal blk in start-idx-th freespace tree */ + ext4_mb_frsp_find_by_goal(ac); + if (ac->ac_status == AC_STATUS_FOUND) + goto out; + + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) + goto out; + + ac->ac_criteria = 0; + ac->ac_groups_scanned = 0; +repeat: + + /* Loop through the rest of trees (flex_bg) */ + for (j = start_idx; + (ac->ac_groups_scanned < sbi->s_mb_num_frsp_trees) && + ac->ac_status == AC_STATUS_CONTINUE; j++) { + ac->ac_groups_scanned++; + tree_idx = (j % sbi->s_mb_num_frsp_trees); + + ret = ext4_mb_load_allocator(sb, + tree_idx * ext4_flex_bg_size(sbi), &e4b, 0); + if (ret) + goto out; + mutex_lock(&e4b.frsp_tree->frsp_lock); + ext4_mb_frsp_process(ac, e4b.frsp_tree); + mutex_unlock(&e4b.frsp_tree->frsp_lock); + ext4_mb_unload_allocator(&e4b); + } + + if (ac->ac_status != AC_STATUS_FOUND) { + if (ac->ac_criteria < 2) { + ac->ac_criteria++; + ac->ac_groups_scanned = 0; + mb_debug(sb, "Falling back to CR=%d", ac->ac_criteria); + goto repeat; + } + if (btx->te_len > 0 && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { + e4b.frsp_flags = 0; + ret = ext4_mb_load_allocator(sb, btx->te_flex * + ext4_flex_bg_size(sbi), &e4b, 0); + if (ret) + goto out; + tree = e4b.frsp_tree; + mutex_lock(&tree->frsp_lock); + cur = ext4_mb_frsp_search_by_off(sb, tree, + btx->te_flex_start); + if (cur) { + ext4_mb_frsp_use_best_found(ac, btx->te_flex, + cur); + mutex_unlock(&tree->frsp_lock); + ext4_mb_unload_allocator(&e4b); + + } else { + /* + * Someone else took this freespace node before + * us. Reset the best-found tree extent, and + * turn on FIRST HINT (greedy). + */ + mutex_unlock(&tree->frsp_lock); + ac->ac_b_tree_ex.te_flex_start = 0; + ac->ac_b_tree_ex.te_flex = 0; + ac->ac_b_tree_ex.te_len = 0; + ac->ac_status = AC_STATUS_CONTINUE; + ac->ac_flags |= EXT4_MB_HINT_FIRST; + ac->ac_groups_scanned = 0; + ext4_mb_unload_allocator(&e4b); + goto repeat; + } + } + } + + ret = btx->te_len ? 
0 : -ENOSPC; + +out: + return ret; +} + +static void ext4_mb_frsp_init_tree(struct ext4_frsp_tree *tree, int index) +{ + tree->frsp_offset_root = RB_ROOT_CACHED; + tree->frsp_len_root = RB_ROOT_CACHED; + mutex_init(&(tree->frsp_lock)); + tree->frsp_flags = 0; + tree->frsp_index = index; +} + +int ext4_mb_init_freespace_trees(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct flex_groups *fg; + int i; + + sbi->s_mb_num_frsp_trees = + ext4_num_grps_to_flexbg(sb, ext4_get_groups_count(sb)); + + for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) { + fg = sbi_array_rcu_deref(sbi, s_flex_groups, i); + fg->frsp_tree = kzalloc(sizeof(struct ext4_frsp_tree), + GFP_KERNEL); + if (!fg->frsp_tree) + return -ENOMEM; + ext4_mb_frsp_init_tree(fg->frsp_tree, i); + } + + return 0; +} + +int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + int flex_bg_count, old_trees_count = sbi->s_mb_num_frsp_trees; + int i; + + if (!ext4_mb_frsp_on(sb)) + return 0; + + flex_bg_count = ext4_num_grps_to_flexbg(sb, ngroups); + if (old_trees_count > 0) + ext4_mb_frsp_free_tree(sb, ext4_get_frsp_tree(sb, + old_trees_count - 1)); + + for (i = old_trees_count; i < flex_bg_count; i++) { + struct flex_groups *fg; + + fg = sbi_array_rcu_deref(sbi, s_flex_groups, i); + fg->frsp_tree = kzalloc(sizeof(*fg->frsp_tree), GFP_KERNEL); + if (!fg->frsp_tree) + return -ENOMEM; + ext4_mb_frsp_init_tree(fg->frsp_tree, i); + } + sbi->s_mb_num_frsp_trees = flex_bg_count; + + return 0; +} + /* * Divide blocks started from @first with length @len into * smaller chunks with power of 2 blocks. @@ -1042,6 +1802,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + if (ext4_mb_frsp_on(sb)) + return 0; + blocks_per_page = PAGE_SIZE / sb->s_blocksize; /* * the buddy cache inode stores the block bitmap @@ -1099,6 +1862,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) struct page *page; int ret = 0; + WARN_ON(ext4_mb_frsp_on(sb)); might_sleep(); mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); @@ -1156,6 +1920,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) * Locking note: This routine calls ext4_mb_init_cache(), which takes the * block group lock of all groups for this page; do not hold the BG lock when * calling this routine! + * + * For freespace trees, do not hold tree mutex while calling this routine. + * It is okay to hold that mutex only if EXT4_ALLOCATOR_FRSP_NOLOAD flag is + * set in e4b->frsp_flags. If that flag is set, this function doesn't try + * to load an unloaded tree. 
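+ * + * For example, ext4_mb_frsp_use_best_found() already runs under the tree + * mutex and passes EXT4_ALLOCATOR_FRSP_NOLOAD, so this routine only wires + * up e4b->frsp_tree and returns without taking the mutex.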
*/ static noinline_for_stack int ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group, @@ -1166,7 +1935,7 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group, int pnum; int poff; struct page *page; - int ret; + int ret = 0; struct ext4_group_info *grp; struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; @@ -1183,6 +1952,22 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + if (ext4_mb_frsp_on(sb)) { + e4b->frsp_tree = ext4_get_frsp_tree(sb, + ext4_flex_group(sbi, group)); + e4b->frsp_flags = flags; + if (flags & EXT4_ALLOCATOR_FRSP_NOLOAD) + return 0; + + mutex_lock(&e4b->frsp_tree->frsp_lock); + if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) { + mutex_unlock(&e4b->frsp_tree->frsp_lock); + return 0; + } + ret = ext4_mb_frsp_load(e4b); + mutex_unlock(&e4b->frsp_tree->frsp_lock); + return ret; + } if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { /* @@ -1305,6 +2090,8 @@ static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group, static void ext4_mb_unload_allocator(struct ext4_buddy *e4b) { + if (ext4_mb_frsp_on(e4b->bd_sb)) + return; if (e4b->bd_bitmap_page) put_page(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) @@ -1497,6 +2284,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; + if (ext4_mb_frsp_on(sb)) { + ext4_grpblk_t first_blk = EXT4_C2B(EXT4_SB(sb), first) + + ext4_group_first_block_no(sb, e4b->bd_group); + + ext4_unlock_group(sb, e4b->bd_group); + ext4_mb_frsp_free_blocks(sb, e4b->bd_group, first_blk, count); + ext4_lock_group(sb, e4b->bd_group); + return; + } + /* access memory sequentially: check left neighbour, * clear range and then check right neighbour */ @@ -1563,6 +2360,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); + if (ext4_mb_frsp_on(e4b->bd_sb)) + return 0; + buddy = mb_find_buddy(e4b, 0, &max); BUG_ON(buddy == NULL); BUG_ON(block >= max); @@ -1627,17 +2427,74 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) unsigned ret = 0; int len0 = len; void *buddy; + ext4_grpblk_t flex_offset; BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); + + e4b->bd_info->bb_free -= len; + if (e4b->bd_info->bb_first_free == start) + e4b->bd_info->bb_first_free += len; + + if (ext4_mb_frsp_on(e4b->bd_sb)) { + struct ext4_frsp_node *new; + + flex_offset = ext4_blkno_to_flex_offset(e4b->bd_sb, + ext4_group_first_block_no(e4b->bd_sb, + e4b->bd_group) + ex->fe_start); + if (!ex->fe_node) { + ex->fe_node = ext4_mb_frsp_search_by_off(e4b->bd_sb, + e4b->frsp_tree, flex_offset); + if (!ex->fe_node) + return 0; + } + /* + * Remove the node from the trees before we modify these since + * we will change the length and / or offset of this node. 
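+ * + * Three outcomes are possible below: the allocated range splits the free + * extent in two, trims it from the front or the back, or matches it + * exactly, in which case the node is removed and freed.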
+ */ + rb_erase_cached(&ex->fe_node->frsp_node, + &e4b->frsp_tree->frsp_offset_root); + rb_erase_cached(&ex->fe_node->frsp_len_node, + &e4b->frsp_tree->frsp_len_root); + RB_CLEAR_NODE(&ex->fe_node->frsp_node); + RB_CLEAR_NODE(&ex->fe_node->frsp_len_node); + if (flex_offset > ex->fe_node->frsp_offset) { + if (flex_offset + ex->fe_len != + ex->fe_node->frsp_offset + + ex->fe_node->frsp_len) { + /* Need to split the node */ + new = ext4_mb_frsp_alloc_node(e4b->bd_sb); + if (!new) + return -ENOMEM; + new->frsp_offset = flex_offset + ex->fe_len; + new->frsp_len = (ex->fe_node->frsp_offset + + ex->fe_node->frsp_len) - + new->frsp_offset; + ext4_mb_frsp_insert_len(e4b->frsp_tree, new); + ext4_mb_frsp_insert_off(e4b->frsp_tree, new); + } + ex->fe_node->frsp_len = flex_offset - + ex->fe_node->frsp_offset; + } else if (ex->fe_len < ex->fe_node->frsp_len) { + /* used only a part: update node */ + ex->fe_node->frsp_offset += ex->fe_len; + ex->fe_node->frsp_len -= ex->fe_len; + } else { + ext4_mb_frsp_free_node(e4b->bd_sb, ex->fe_node); + return 0; + } + + ext4_mb_frsp_insert_len(e4b->frsp_tree, ex->fe_node); + ext4_mb_frsp_insert_off(e4b->frsp_tree, ex->fe_node); + + return 0; + } + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); this_cpu_inc(discard_pa_seq); - e4b->bd_info->bb_free -= len; - if (e4b->bd_info->bb_first_free == start) - e4b->bd_info->bb_first_free += len; /* let's maintain fragments counter */ if (start != 0) @@ -2778,6 +3635,13 @@ static int ext4_mb_init_backend(struct super_block *sb) rcu_read_lock(); kvfree(rcu_dereference(sbi->s_group_info)); rcu_read_unlock(); + if (ext4_mb_frsp_on(sb)) { + for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) { + struct ext4_frsp_tree *tree = ext4_get_frsp_tree(sb, i); + + kfree(tree); + } + } return -ENOMEM; } @@ -2874,6 +3738,13 @@ int ext4_mb_init(struct super_block *sb) i++; } while (i <= sb->s_blocksize_bits + 1); + /* init for freespace trees */ + if (ext4_mb_frsp_on(sb)) { + ret = ext4_mb_init_freespace_trees(sb); + if (ret) + goto out; + } + spin_lock_init(&sbi->s_md_lock); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; @@ -2940,6 +3811,13 @@ int ext4_mb_init(struct super_block *sb) sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); sbi->s_mb_maxs = NULL; + if (ext4_mb_frsp_on(sb)) { + for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) { + struct ext4_frsp_tree *tree = ext4_get_frsp_tree(sb, i); + + kfree(tree); + } + } return ret; } @@ -3084,8 +3962,10 @@ static void ext4_free_data_in_buddy(struct super_block *sb, /* No more items in the per group rb tree * balance refcounts from ext4_mb_free_metadata() */ - put_page(e4b.bd_buddy_page); - put_page(e4b.bd_bitmap_page); + if (!ext4_mb_frsp_on(sb)) { + put_page(e4b.bd_buddy_page); + put_page(e4b.bd_bitmap_page); + } } ext4_unlock_group(sb, entry->efd_group); kmem_cache_free(ext4_free_data_cachep, entry); @@ -3163,9 +4043,14 @@ int __init ext4_init_mballoc(void) SLAB_RECLAIM_ACCOUNT); if (ext4_free_data_cachep == NULL) goto out_ac_free; + ext4_freespace_node_cachep = KMEM_CACHE(ext4_frsp_node, + SLAB_RECLAIM_ACCOUNT); + if (ext4_freespace_node_cachep == NULL) + goto out_frsp_free; return 0; - +out_frsp_free: + kmem_cache_destroy(ext4_free_data_cachep); out_ac_free: kmem_cache_destroy(ext4_ac_cachep); out_pa_free: @@ -3184,6 +4069,7 @@ void ext4_exit_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); kmem_cache_destroy(ext4_free_data_cachep); + 
kmem_cache_destroy(ext4_freespace_node_cachep); ext4_groupinfo_destroy_slabs(); } @@ -3686,6 +4572,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) return false; + if (ext4_mb_frsp_on(ac->ac_sb)) + return 0; + /* first, try per-file preallocation */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { @@ -4362,6 +5251,8 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) struct ext4_prealloc_space *pa; BUG_ON(ext4_pspace_cachep == NULL); + if (ext4_mb_frsp_on(ac->ac_sb)) + return 0; pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); if (!pa) return -ENOMEM; @@ -4374,6 +5265,8 @@ static void ext4_mb_pa_free(struct ext4_allocation_context *ac) { struct ext4_prealloc_space *pa = ac->ac_pa; + if (ext4_mb_frsp_on(ac->ac_sb)) + return; BUG_ON(!pa); ac->ac_pa = NULL; WARN_ON(!atomic_dec_and_test(&pa->pa_count)); @@ -4547,6 +5440,13 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_o_ex.fe_len = len; ac->ac_g_ex = ac->ac_o_ex; ac->ac_flags = ar->flags; + if (ext4_mb_frsp_on(sb)) + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; + + /* set up best-found tree node */ + ac->ac_b_tree_ex.te_flex_start = 0; + ac->ac_b_tree_ex.te_flex = 0; + ac->ac_b_tree_ex.te_len = 0; /* we have to define context: we'll we work with a file or * locality group. this is a policy, actually */ @@ -4867,7 +5767,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto errout; repeat: /* allocate space in core */ - *errp = ext4_mb_regular_allocator(ac); + if (ext4_mb_frsp_on(sb)) + *errp = ext4_mb_tree_allocator(ac); + else + *errp = ext4_mb_regular_allocator(ac); + /* * pa allocated above is added to grp->bb_prealloc_list only * when we were able to allocate some block i.e. when @@ -4880,8 +5784,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_discard_allocated_blocks(ac); goto errout; } - if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) + /* + * Freespace trees should never return more than what was asked + * for. + */ + if (!ext4_mb_frsp_on(sb) && + (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)) ext4_mb_pa_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { @@ -4972,13 +5881,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct rb_node *parent = NULL, *new_node; BUG_ON(!ext4_handle_valid(handle)); - BUG_ON(e4b->bd_bitmap_page == NULL); - BUG_ON(e4b->bd_buddy_page == NULL); + BUG_ON(!ext4_mb_frsp_on(sb) && e4b->bd_bitmap_page == NULL); new_node = &new_entry->efd_node; cluster = new_entry->efd_start_cluster; - if (!*n) { + if (!*n && !ext4_mb_frsp_on(sb)) { /* first free block exent. 
We need to protect buddy cache from being freed, * otherwise we'll refresh it from @@ -5457,6 +6365,7 @@ __acquires(bitlock) ex.fe_start = start; ex.fe_group = group; ex.fe_len = count; + ex.fe_node = NULL; /* * Mark blocks used, so no one can reuse them while @@ -5496,6 +6405,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, void *bitmap; ext4_grpblk_t next, count = 0, free_count = 0; struct ext4_buddy e4b; + struct buffer_head *bh = NULL; int ret = 0; trace_ext4_trim_all_free(sb, group, start, max); @@ -5506,7 +6416,17 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, ret, group); return ret; } - bitmap = e4b.bd_bitmap; + + if (ext4_mb_frsp_on(sb)) { + bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bh)) { + ret = PTR_ERR(bh); + goto out; + } + bitmap = bh->b_data; + } else { + bitmap = e4b.bd_bitmap; + } ext4_lock_group(sb, group); if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && @@ -5553,6 +6473,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); } out: + if (!IS_ERR_OR_NULL(bh)) + brelse(bh); ext4_unlock_group(sb, group); ext4_mb_unload_allocator(&e4b); @@ -5613,7 +6535,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) for (group = first_group; group <= last_group; group++) { grp = ext4_get_group_info(sb, group); /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + if (unlikely(!ext4_mb_frsp_on(sb) && EXT4_MB_GRP_NEED_INIT(grp))) { ret = ext4_mb_init_group(sb, group, GFP_NOFS); if (ret) break; @@ -5667,16 +6589,25 @@ ext4_mballoc_query_range( ext4_grpblk_t next; struct ext4_buddy e4b; int error; + struct buffer_head *bh = NULL; - error = ext4_mb_load_allocator(sb, group, &e4b, 0); - if (error) - return error; - bitmap = e4b.bd_bitmap; - + if (ext4_mb_frsp_on(sb)) { + bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR(bh)) { + error = PTR_ERR(bh); + goto out_unload; + } + bitmap = bh->b_data; + } else { + error = ext4_mb_load_allocator(sb, group, &e4b, 0); + if (error) + return error; + bitmap = e4b.bd_bitmap; + } ext4_lock_group(sb, group); - - start = (e4b.bd_info->bb_first_free > start) ? - e4b.bd_info->bb_first_free : start; + if (!ext4_mb_frsp_on(sb)) + start = (e4b.bd_info->bb_first_free > start) ? 
+ e4b.bd_info->bb_first_free : start; if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; @@ -5685,19 +6616,20 @@ ext4_mballoc_query_range( if (start > end) break; next = mb_find_next_bit(bitmap, end + 1, start); - ext4_unlock_group(sb, group); error = formatter(sb, group, start, next - start, priv); if (error) goto out_unload; ext4_lock_group(sb, group); - start = next + 1; } ext4_unlock_group(sb, group); out_unload: - ext4_mb_unload_allocator(&e4b); + if (!IS_ERR_OR_NULL(bh)) + brelse(bh); + if (!ext4_mb_frsp_on(sb)) + ext4_mb_unload_allocator(&e4b); return error; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 6b4d17c2935d..32b9ee452de7 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -74,6 +74,20 @@ #define MB_DEFAULT_GROUP_PREALLOC 512 +/* + * Struct for tree node in freespace_tree + */ +struct ext4_frsp_node { + ext4_grpblk_t frsp_offset; /* Start block offset inside + * current flexible group + */ + ext4_grpblk_t frsp_len; /* + * Length of the free space in + * number of clusters + */ + struct rb_node frsp_node; + struct rb_node frsp_len_node; +}; struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; @@ -121,6 +135,7 @@ struct ext4_free_extent { ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; ext4_grpblk_t fe_len; /* In cluster units */ + struct ext4_frsp_node *fe_node; }; /* @@ -141,7 +156,14 @@ struct ext4_locality_group { spinlock_t lg_prealloc_lock; }; +struct ext4_tree_extent { + ext4_group_t te_flex; /* flex_bg index (tree index) */ + ext4_grpblk_t te_flex_start; /* block offset w.r.t flex bg */ + ext4_grpblk_t te_len; /* length */ +}; + struct ext4_allocation_context { + __u32 ac_id; struct inode *ac_inode; struct super_block *ac_sb; @@ -154,8 +176,16 @@ struct ext4_allocation_context { /* the best found extent */ struct ext4_free_extent ac_b_ex; - /* copy of the best found extent taken before preallocation efforts */ - struct ext4_free_extent ac_f_ex; + /* With freespace trees, we don't use preallocation anymore. */ + union { + /* + * copy of the best found extent taken before + * preallocation efforts + */ + struct ext4_free_extent ac_f_ex; + /* the best found tree extent */ + struct ext4_tree_extent ac_b_tree_ex; + }; __u16 ac_groups_scanned; __u16 ac_found; @@ -177,14 +207,31 @@ struct ext4_allocation_context { #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 +/* + * Freespace tree flags + */ +#define EXT4_ALLOCATOR_FRSP_NOLOAD 0x0001 /* + * Don't load freespace + * tree, if it's not + * in memory. 
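* Used by callers * that already hold * the tree mutex.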
+ */ + struct ext4_buddy { - struct page *bd_buddy_page; - void *bd_buddy; - struct page *bd_bitmap_page; - void *bd_bitmap; + union { + struct { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + __u16 bd_blkbits; + }; + struct { + struct ext4_frsp_tree *frsp_tree; + __u32 frsp_flags; + }; + }; struct ext4_group_info *bd_info; struct super_block *bd_sb; - __u16 bd_blkbits; ext4_group_t bd_group; }; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index a50b51270ea9..6a0e1fc18e95 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1679,6 +1679,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (err) goto out; + err = ext4_mb_add_frsp_trees(sb, input->group + 1); + if (err) + goto out; + flex_gd.count = 1; flex_gd.groups = input; flex_gd.bg_flags = &bg_flags; @@ -2051,6 +2055,10 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) if (err) goto out; + err = ext4_mb_add_frsp_trees(sb, n_group + 1); + if (err) + goto out; + flex_gd = alloc_flex_gd(flexbg_size); if (flex_gd == NULL) { err = -ENOMEM; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 98aa12602b68..2f4b7061365f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1521,7 +1521,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, + Opt_prefetch_block_bitmaps, Opt_freespace_tree, }; static const match_table_t tokens = { @@ -1608,6 +1608,7 @@ static const match_table_t tokens = { {Opt_init_itable, "init_itable=%u"}, {Opt_init_itable, "init_itable"}, {Opt_noinit_itable, "noinit_itable"}, + {Opt_freespace_tree, "freespace_tree"}, {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, {Opt_test_dummy_encryption, "test_dummy_encryption"}, @@ -1834,6 +1835,8 @@ static const struct mount_opts { {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, + {Opt_freespace_tree, EXT4_MOUNT2_FREESPACE_TREE, + MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, {Opt_err, 0, 0} }; @@ -4740,12 +4743,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount4a; } + if (ext4_has_feature_flex_bg(sb)) + if (!ext4_fill_flex_info(sb)) { + ext4_msg(sb, KERN_ERR, + "unable to initialize flex_bg meta info!"); + goto failed_mount5; + } + ext4_ext_init(sb); err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", err); - goto failed_mount5; + goto failed_mount6; } block = ext4_count_free_clusters(sb); @@ -4775,14 +4785,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount6; } - if (ext4_has_feature_flex_bg(sb)) - if (!ext4_fill_flex_info(sb)) { - ext4_msg(sb, KERN_ERR, - "unable to initialize " - "flex_bg meta info!"); - goto failed_mount6; - } - err = ext4_register_li_request(sb, first_not_zeroed); if (err) goto failed_mount6; @@ -4856,7 +4858,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_unregister_li_request(sb); failed_mount6: ext4_mb_release(sb); + percpu_counter_destroy(&sbi->s_freeclusters_counter); + percpu_counter_destroy(&sbi->s_freeinodes_counter); + percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_writepages_rwsem); rcu_read_lock(); + 
+failed_mount5: flex_groups = rcu_dereference(sbi->s_flex_groups); if (flex_groups) { for (i = 0; i < sbi->s_flex_groups_allocated; i++) @@ -4864,12 +4873,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kvfree(flex_groups); } rcu_read_unlock(); - percpu_counter_destroy(&sbi->s_freeclusters_counter); - percpu_counter_destroy(&sbi->s_freeinodes_counter); - percpu_counter_destroy(&sbi->s_dirs_counter); - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_free_rwsem(&sbi->s_writepages_rwsem); -failed_mount5: ext4_ext_release(sb); ext4_release_system_zone(sb); failed_mount4a: -- 2.28.0.220.ged08abb693-goog