Hi Harshad, Some questions bellow. > On 19 Aug 2020, at 10:30, Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx> wrote: > > From: Yuexin Li <lyx1209@xxxxxxxxx> > > This patch adds a new freespace tree allocator. The allocator > organizes free space regions in a couple of in-memory rbtrees, one > sorted by length and the other by offset. We use these per-flex-bg > level trees to quickly scan length sorted free space regions and > determine the best region for a given request. This feature can be > enabled by passing "-o freespace_tree" mount option. > > Signed-off-by: Yuexin Li <lyx1209@xxxxxxxxx> > Co-developed-by: Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx> > Signed-off-by: Harshad Shirwadkar <harshadshirwadkar@xxxxxxxxx> > --- > fs/ext4/ext4.h | 45 +++ > fs/ext4/mballoc.c | 984 ++++++++++++++++++++++++++++++++++++++++++++-- > fs/ext4/mballoc.h | 61 ++- > fs/ext4/resize.c | 8 + > fs/ext4/super.c | 35 +- > 5 files changed, 1084 insertions(+), 49 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 4df6f429de1a..3bb2675d4d40 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -363,6 +363,7 @@ struct flex_groups { > atomic64_t free_clusters; > atomic_t free_inodes; > atomic_t used_dirs; > + struct ext4_frsp_tree *frsp_tree; > }; > > #define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ > @@ -1197,6 +1198,12 @@ struct ext4_inode_info { > #define EXT4_MOUNT2_EXPLICIT_JOURNAL_CHECKSUM 0x00000008 /* User explicitly > specified journal checksum */ > > + > +#define EXT4_MOUNT2_FREESPACE_TREE 0x00000020 /* Enable freespace tree > + * allocator (turns buddy > + * allocator off) > + */ > + > #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ > ~EXT4_MOUNT_##opt > #define set_opt(sb, opt) EXT4_SB(sb)->s_mount_opt |= \ > @@ -1402,6 +1409,30 @@ struct ext4_super_block { > #define ext4_has_strict_mode(sbi) \ > (sbi->s_encoding_flags & EXT4_ENC_STRICT_MODE_FL) > > +/* > + * Freespace tree structure > + */ > +struct ext4_frsp_tree { > + struct rb_root_cached frsp_offset_root; /* Root for offset > + * sorted tree > + */ > + struct rb_root_cached frsp_len_root; /* Root for Length > + * sorted tree > + */ > + struct mutex frsp_lock; > + __u8 frsp_flags; > + __u32 frsp_max_free_len; /* Max free extent in > + * this flex bg > + */ > + __u32 frsp_index; /* Tree index (flex bg > + * number) > + */ > +}; > + > +/* Freespace tree flags */ > + > +/* Tree is loaded in memory */ > +#define EXT4_MB_FRSP_FLAG_LOADED 0x0001 > /* > * fourth extended-fs super-block data in memory > */ > @@ -1539,6 +1570,9 @@ struct ext4_sb_info { > struct flex_groups * __rcu *s_flex_groups; > ext4_group_t s_flex_groups_allocated; > > + /* freespace trees stuff */ > + int s_mb_num_frsp_trees; > + > /* workqueue for reserved extent conversions (buffered io) */ > struct workqueue_struct *rsv_conversion_wq; > > @@ -2653,6 +2687,7 @@ extern int ext4_init_inode_table(struct super_block *sb, > extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate); > > /* mballoc.c */ > +extern int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups); > extern const struct seq_operations ext4_mb_seq_groups_ops; > extern long ext4_mb_stats; > extern long ext4_mb_max_to_scan; > @@ -3084,6 +3119,16 @@ static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, > return block_group >> sbi->s_log_groups_per_flex; > } > > +static inline struct ext4_frsp_tree * > +ext4_get_frsp_tree(struct super_block *sb, ext4_group_t flex_bg) > +{ > + struct flex_groups *fg; > + > + fg = 
sbi_array_rcu_deref(EXT4_SB(sb), s_flex_groups, flex_bg); > + > + return fg->frsp_tree; > +} > + > static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) > { > return 1 << sbi->s_log_groups_per_flex; > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c > index 2d8d3d9c7918..74bdc2dcb49c 100644 > --- a/fs/ext4/mballoc.c > +++ b/fs/ext4/mballoc.c > @@ -333,6 +333,7 @@ > static struct kmem_cache *ext4_pspace_cachep; > static struct kmem_cache *ext4_ac_cachep; > static struct kmem_cache *ext4_free_data_cachep; > +static struct kmem_cache *ext4_freespace_node_cachep; > > /* We create slab caches for groupinfo data structures based on the > * superblock block size. There will be one per mounted filesystem for > @@ -352,6 +353,11 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, > ext4_group_t group); > static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); > > +static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group, > + struct ext4_buddy *e4b, int flags); > +static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex); > +static void ext4_mb_unload_allocator(struct ext4_buddy *e4b); > + > /* > * The algorithm using this percpu seq counter goes below: > * 1. We sample the percpu discard_pa_seq counter before trying for block > @@ -395,6 +401,33 @@ static inline void *mb_correct_addr_and_bit(int *bit, void *addr) > return addr; > } > > +static inline int ext4_blkno_to_flex_offset(struct super_block *sb, > + ext4_fsblk_t blkno) > +{ > + return blkno % (ext4_flex_bg_size(EXT4_SB(sb)) * > + EXT4_SB(sb)->s_blocks_per_group); > +} > + > +static inline ext4_fsblk_t ext4_flex_offset_to_blkno(struct super_block *sb, > + int flex_bg, int flex_offset) > +{ > + return flex_bg * ext4_flex_bg_size(EXT4_SB(sb)) * > + EXT4_SB(sb)->s_blocks_per_group + flex_offset; > +} > + > +static inline int ext4_mb_frsp_on(struct super_block *sb) > +{ > + return test_opt2(sb, FREESPACE_TREE) && > + EXT4_SB(sb)->s_es->s_log_groups_per_flex; > +} > + > +static inline unsigned int ext4_num_grps_to_flexbg(struct super_block *sb, > + int ngroups) > +{ > + return (ngroups + ext4_flex_bg_size(EXT4_SB(sb)) - 1) >> > + (EXT4_SB(sb)->s_es->s_log_groups_per_flex); > +} > + > static inline int mb_test_bit(int bit, void *addr) > { > /* > @@ -453,6 +486,7 @@ static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) > { > char *bb; > > + WARN_ON(ext4_mb_frsp_on(e4b->bd_sb)); > BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); > BUG_ON(max == NULL); > > @@ -620,6 +654,9 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, > void *buddy; > void *buddy2; > > + if (ext4_mb_frsp_on(sb)) > + return 0; > + > { > static int mb_check_counter; > if (mb_check_counter++ % 100 != 0) > @@ -706,6 +743,729 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, > #define mb_check_buddy(e4b) > #endif > > +/* Generic macro for inserting a new node in cached rbtree */ > +#define ext4_mb_rb_insert(__root, __new, __node_t, __entry, __cmp) do { \ > + struct rb_node **iter = &((__root)->rb_root.rb_node), *parent = NULL; \ > + bool leftmost = true; \ > + __node_t *this = NULL; \ > + \ > + while (*iter) { \ > + this = rb_entry(*iter, __node_t, __entry); \ > + parent = *iter; \ > + if (__cmp((__new), this)) { \ > + iter = &((*iter)->rb_left); \ > + } else { \ > + iter = &((*iter)->rb_right); \ > + leftmost = false; \ > + } \ > + } \ > + rb_link_node(&(__new)->__entry, parent, iter); \ > + rb_insert_color_cached(&(__new)->__entry, 
__root, leftmost); \ > +} while (0) Using kernel primitives from lib/rbtree.c is preferable. This lib has a lot of users and looks stable. There is rb_insert_color() there that can be used against ext4_mb_rb_insert() > + > +static inline int ext4_mb_frsp_offset_cmp(struct ext4_frsp_node *arg1, > + struct ext4_frsp_node *arg2) > +{ > + return (arg1->frsp_offset < arg2->frsp_offset); > +} > + > +/* > + * Free space extents length sorting function, the nodes are sorted in > + * decreasing order of length > + */ > +static inline int ext4_mb_frsp_len_cmp(struct ext4_frsp_node *arg1, > + struct ext4_frsp_node *arg2) > +{ > + return (arg1->frsp_len > arg2->frsp_len); > +} > + > +/* insert to offset-indexed tree */ > +static void ext4_mb_frsp_insert_off(struct ext4_frsp_tree *tree, > + struct ext4_frsp_node *new_entry) > +{ > + memset(&new_entry->frsp_node, 0, sizeof(new_entry->frsp_node)); > + ext4_mb_rb_insert(&tree->frsp_offset_root, new_entry, > + struct ext4_frsp_node, frsp_node, ext4_mb_frsp_offset_cmp); > +} > + > +/* insert to tree sorted by length */ > +static void ext4_mb_frsp_insert_len(struct ext4_frsp_tree *tree, > + struct ext4_frsp_node *new_entry) > +{ > + memset(&new_entry->frsp_len_node, 0, sizeof(new_entry->frsp_len_node)); > + ext4_mb_rb_insert(&tree->frsp_len_root, new_entry, > + struct ext4_frsp_node, frsp_len_node, > + ext4_mb_frsp_len_cmp); > +} > + > +#ifdef CONFIG_EXT4_DEBUG > +/* print freespace_tree in pre-order traversal */ > +void ext4_mb_frsp_print_tree_len(struct super_block *sb, > + struct ext4_frsp_tree *tree) > +{ > + unsigned int count = 0; > + ext4_fsblk_t blk = 0, blk_end = 0; > + ext4_group_t group = 0, group_end = 0; > + struct ext4_frsp_node *entry = NULL; > + struct ext4_sb_info *sbi = EXT4_SB(sb); > + struct rb_node *cur; > + > + cur = rb_first_cached(&tree->frsp_len_root); > + mb_debug(sb, "\nTree\tNode\tlength\tblock\tgroup\n"); > + > + while (cur) { > + entry = rb_entry(cur, struct ext4_frsp_node, frsp_len_node); > + count++; > + blk = ext4_flex_offset_to_blkno(sb, tree->frsp_index, > + entry->frsp_offset); > + blk_end = ext4_flex_offset_to_blkno(sb, tree->frsp_index, > + entry->frsp_offset + entry->frsp_len - 1); > + group = blk / sbi->s_blocks_per_group; > + group_end = (blk_end-1) / sbi->s_blocks_per_group; > + mb_debug(sb, "%u\t%u\t%u\t%u(%lld)--%llu\t%u--%u\n", > + tree->frsp_index, count, entry->frsp_len, > + entry->frsp_offset, blk, blk_end, group, group_end); > + cur = rb_next(cur); > + } > +} > +#endif > + > +static struct ext4_frsp_node *ext4_mb_frsp_alloc_node(struct super_block *sb) > +{ > + struct ext4_frsp_node *node; > + > + node = kmem_cache_alloc(ext4_freespace_node_cachep, GFP_NOFS); > + if (!node) > + return NULL; > + > + RB_CLEAR_NODE(&node->frsp_node); > + RB_CLEAR_NODE(&node->frsp_len_node); > + > + return node; > +} > + > +static void ext4_mb_frsp_free_node(struct super_block *sb, > + struct ext4_frsp_node *node) > +{ > + kmem_cache_free(ext4_freespace_node_cachep, node); > +} > + > +/* Evict a tree from memory */ > +void ext4_mb_frsp_free_tree(struct super_block *sb, struct ext4_frsp_tree *tree) > +{ > + struct ext4_frsp_node *frsp_node; > + struct rb_node *node; > + > + mb_debug(sb, "Evicting tree %d\n", tree->frsp_index); > + mutex_lock(&tree->frsp_lock); > + if (!(tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED)) > + goto out; > + > + node = rb_first_cached(&tree->frsp_offset_root); > + while (node) { > + frsp_node = rb_entry(node, struct ext4_frsp_node, frsp_node); > + rb_erase_cached(node, &tree->frsp_offset_root); > + 
rb_erase_cached(&frsp_node->frsp_len_node, > + &tree->frsp_len_root); > + ext4_mb_frsp_free_node(sb, frsp_node); > + node = rb_first_cached(&tree->frsp_offset_root); > + } > + tree->frsp_flags &= ~EXT4_MB_FRSP_FLAG_LOADED; > + tree->frsp_offset_root = RB_ROOT_CACHED; > + tree->frsp_len_root = RB_ROOT_CACHED; > +out: > + mutex_unlock(&tree->frsp_lock); > +} > + > +/* > + * Search tree by flexbg offset. Returns the node containing "target". If > + * no such node is present, returns NULL. Must be called with tree mutex held. > + */ > +struct ext4_frsp_node *ext4_mb_frsp_search_by_off(struct super_block *sb, > + struct ext4_frsp_tree *tree, > + ext4_grpblk_t target) > +{ > + struct rb_root *root = &tree->frsp_offset_root.rb_root; > + struct rb_node *node = root->rb_node; > + struct ext4_frsp_node *this = NULL; > + > + while (node) { > + this = rb_entry(node, struct ext4_frsp_node, frsp_node); > + if (this->frsp_offset == target) > + return this; > + else if (target < this->frsp_offset) > + node = node->rb_left; > + else > + node = node->rb_right; > + } > + return NULL; > +} > + > +/* > + * Check if two entries in freespace tree can be merged together. Nodes > + * can merge together only if they are physically contiguous and belong > + * to the same block group. > + */ > +int ext4_mb_frsp_node_can_merge(struct super_block *sb, > + struct ext4_frsp_node *prev_entry, struct ext4_frsp_node *cur_entry) > +{ > + if (prev_entry->frsp_offset + prev_entry->frsp_len != > + cur_entry->frsp_offset) > + return 0; > + if (prev_entry->frsp_offset / EXT4_SB(sb)->s_blocks_per_group != > + cur_entry->frsp_offset / EXT4_SB(sb)->s_blocks_per_group) > + return 0; > + return 1; > +} > + > +/* > + * Add new free space region to tree. We insert the new node in both, the > + * length sorted and the flex-bg offset sorted trees. Must be called with > + * tree mutex held. > + */ > +int ext4_mb_frsp_add_region(struct super_block *sb, struct ext4_frsp_tree *tree, > + ext4_grpblk_t offset, ext4_grpblk_t len) > +{ > + struct ext4_frsp_node *new_entry = NULL, *next_entry = NULL, > + *prev_entry = NULL; > + struct rb_node *left = NULL, *right = NULL; > + > + new_entry = ext4_mb_frsp_alloc_node(sb); > + if (!new_entry) > + return -ENOMEM; > + > + new_entry->frsp_offset = offset; > + new_entry->frsp_len = len; > + > + ext4_mb_frsp_insert_off(tree, new_entry); > + /* try merge to left and right */ > + /* left */ > + left = rb_prev(&new_entry->frsp_node); > + if (left) { > + prev_entry = rb_entry(left, struct ext4_frsp_node, frsp_node); > + if (ext4_mb_frsp_node_can_merge(sb, prev_entry, new_entry)) { > + new_entry->frsp_offset = prev_entry->frsp_offset; > + new_entry->frsp_len += prev_entry->frsp_len; > + rb_erase_cached(left, &tree->frsp_offset_root); > + rb_erase_cached(&prev_entry->frsp_len_node, > + &tree->frsp_len_root); > + ext4_mb_frsp_free_node(sb, prev_entry); > + } > + } > + > + /* right */ > + right = rb_next(&new_entry->frsp_node); > + if (right) { > + next_entry = rb_entry(right, struct ext4_frsp_node, frsp_node); > + if (ext4_mb_frsp_node_can_merge(sb, new_entry, next_entry)) { > + new_entry->frsp_len += next_entry->frsp_len; > + rb_erase_cached(right, &tree->frsp_offset_root); > + rb_erase_cached(&next_entry->frsp_len_node, > + &tree->frsp_len_root); > + ext4_mb_frsp_free_node(sb, next_entry); > + } > + } > + ext4_mb_frsp_insert_len(tree, new_entry); > + > + return 0; > +} > + > +/* > + * Free length number of blocks starting at block number block in free space > + * tree. 
> + */ > +int ext4_mb_frsp_free_blocks(struct super_block *sb, ext4_group_t group, > + ext4_grpblk_t block, ext4_grpblk_t length) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(sb); > + struct ext4_frsp_tree *tree = > + ext4_get_frsp_tree(sb, ext4_flex_group(sbi, group)); > + int err = 0; > + > + mutex_lock(&tree->frsp_lock); > + err = ext4_mb_frsp_add_region(sb, tree, > + ext4_blkno_to_flex_offset(sb, block), length); > + mutex_unlock(&tree->frsp_lock); > + > + return err; > +} > + > +/* > + * Create freespace tree from on-disk bitmap. Must be called with tree mutex > + * held. Returns 0 on success, error otherwise. > + */ > +int ext4_mb_frsp_bb_to_tree(struct ext4_buddy *e4b, ext4_group_t group, > + struct buffer_head *bh) > +{ > + struct super_block *sb = e4b->bd_sb; > + int length = 0, bit = 0, next; > + int end = EXT4_SB(sb)->s_blocks_per_group; > + ext4_fsblk_t block; > + int ret = 0; > + > + /* find all unused blocks in bitmap, convert them to new tree node */ > + while (bit < end) { > + bit = mb_find_next_zero_bit(bh->b_data, end, bit); > + if (bit >= end) > + break; > + > + next = mb_find_next_bit(bh->b_data, end, bit); > + length = next - bit; > + block = ext4_group_first_block_no(sb, group) + bit; > + > + ret = ext4_mb_frsp_add_region(sb, e4b->frsp_tree, > + ext4_blkno_to_flex_offset(sb, block), length); > + if (ret) > + break; > + bit = next + 1; > + } > + > + return ret; > +} > + > +/* > + * Load one freespace_tree from disk. Assume holding mutex lock on tree. > + */ > +int ext4_mb_frsp_load(struct ext4_buddy *e4b) > +{ > + ext4_group_t ngroups, group_start, group_end, grp; > + struct ext4_sb_info *sbi = EXT4_SB(e4b->bd_sb); > + int i, ret = 0; > + struct buffer_head **bh = NULL; > + > + if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) > + return 0; > + > + group_start = e4b->frsp_tree->frsp_index * ext4_flex_bg_size(sbi); > + group_end = min(group_start + ext4_flex_bg_size(sbi), > + ext4_get_groups_count(e4b->bd_sb)) - 1; > + ngroups = group_end - group_start + 1; > + > + bh = kcalloc(ngroups, sizeof(*bh), GFP_KERNEL); > + if (!bh) > + return -ENOMEM; > + for (i = 0; i < ngroups; i++) { > + grp = i + group_start; > + bh[i] = ext4_read_block_bitmap_nowait(e4b->bd_sb, grp, false); > + if (IS_ERR(bh[i])) { > + ret = PTR_ERR(bh[i]); > + goto out; > + } > + } > + for (i = 0; i < ngroups; i++) { > + grp = i + group_start; > + if (!bitmap_uptodate(bh[i])) { > + ret = ext4_wait_block_bitmap(e4b->bd_sb, grp, bh[i]); > + if (ret) > + goto out; > + } > + ret = ext4_mb_frsp_bb_to_tree(e4b, grp, bh[i]); > + if (ret) > + goto out; > + } > + e4b->frsp_tree->frsp_flags |= EXT4_MB_FRSP_FLAG_LOADED; > +out: > + for (i = 0; i < ngroups; i++) { > + if (!IS_ERR_OR_NULL(bh[i])) > + put_bh(bh[i]); > + if (!ret && IS_ERR(bh[i])) > + ret = PTR_ERR(bh[i]); > + } > + kfree(bh); > + return ret; > + > +} > + > +/* > + * Determine if node with length len is better than what we have found until > + * now. Return 1 if that is the case, 0 otherwise. If len is exact match, > + * set *best = 1. 
> + */ > +static int ext4_mb_frsp_is_better(struct ext4_allocation_context *ac, > + int len, int *best) > +{ > + struct ext4_tree_extent *btx = &ac->ac_b_tree_ex; > + struct ext4_free_extent *gex = &ac->ac_g_ex; > + > + if (best) > + *best = 0; > + if (len == gex->fe_len) { > + if (best) > + *best = 1; > + return 1; > + } > + if (ac->ac_criteria == 0 && len < gex->fe_len) > + return 0; > + /* > + * See if the new node is cloer to goal length than > + * the best extent found so far > + */ > + if (btx->te_len < gex->fe_len && len > btx->te_len) > + return 1; > + if (len > gex->fe_len && len < btx->te_len) > + return 1; > + return 0; > +} > + > +/* > + * check if we have scanned sufficient freespace candidates > + * stop scanning if reached/exceeded s_max_to_scan > + */ > +static void ext4_mb_frsp_check_limits(struct ext4_allocation_context *ac) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); > + > + if (ac->ac_status == AC_STATUS_FOUND) > + return; > + > + /* > + * Exceeded max number of nodes to scan > + */ > + if (ac->ac_found > sbi->s_mb_max_to_scan && > + !(ac->ac_flags & EXT4_MB_HINT_FIRST)) > + ac->ac_status = AC_STATUS_BREAK; > +} > + > +/* > + * Mark free space in selected tree node as used and update the tree. > + * This must be called with tree lock. > + */ > +static void ext4_mb_frsp_use_best_found(struct ext4_allocation_context *ac, > + ext4_group_t flex, struct ext4_frsp_node *selected) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); > + struct ext4_tree_extent *btx = &ac->ac_b_tree_ex; > + struct ext4_free_extent *bex; > + unsigned int group_no; > + struct ext4_buddy e4b; > + > + WARN_ON(ac->ac_status == AC_STATUS_FOUND); > + btx->te_len = min(btx->te_len, ac->ac_g_ex.fe_len); > + ac->ac_status = AC_STATUS_FOUND; > + > + /* update ac best-found-extent */ > + bex = &ac->ac_b_ex; > + group_no = (btx->te_flex * ext4_flex_bg_size(sbi)) + > + (btx->te_flex_start / sbi->s_blocks_per_group); > + bex->fe_start = btx->te_flex_start % sbi->s_blocks_per_group; > + bex->fe_group = group_no; > + bex->fe_len = btx->te_len; > + bex->fe_node = selected; > + > + /* Add free blocks to our tree */ > + ext4_mb_load_allocator(ac->ac_sb, group_no, &e4b, > + EXT4_ALLOCATOR_FRSP_NOLOAD); > + mb_mark_used(&e4b, bex); > + ext4_mb_unload_allocator(&e4b); > +} > +/* > + * The routine checks whether found tree node is good enough. If it is, > + * then the tree node gets marked used and flag is set to the context > + * to stop scanning. > + * > + * Otherwise, the tree node is compared with the previous found extent and > + * if new one is better, then it's stored in the context. > + * > + * Later, the best found tree node will be used, if mballoc can't find good > + * enough extent. 
> + */ > +static int ext4_mb_frsp_measure_node(struct ext4_allocation_context *ac, > + int tree_idx, struct ext4_frsp_node *cur, > + int goal) > +{ > + struct ext4_tree_extent *btx = &ac->ac_b_tree_ex; > + ext4_fsblk_t start; > + int best_found = 0, max; > + > + WARN_ON(btx->te_len < 0); > + WARN_ON(ac->ac_status != AC_STATUS_CONTINUE); > + > + if (goal) { > + start = ext4_group_first_block_no(ac->ac_sb, > + ac->ac_g_ex.fe_group) + > + ac->ac_g_ex.fe_start; > + if (cur->frsp_offset > ext4_blkno_to_flex_offset(ac->ac_sb, > + start)) > + return 0; > + max = cur->frsp_offset + cur->frsp_len - > + ext4_blkno_to_flex_offset(ac->ac_sb, start); > + if (max >= ac->ac_g_ex.fe_len && > + ac->ac_g_ex.fe_len == EXT4_SB(ac->ac_sb)->s_stripe) { > + if (do_div(start, EXT4_SB(ac->ac_sb)->s_stripe) != 0) > + return 0; > + best_found = 1; > + } else if (max >= ac->ac_g_ex.fe_len) { > + best_found = 1; > + } else if (max > 0 && (ac->ac_flags & EXT4_MB_HINT_MERGE)) { > + /* > + * Sometimes, caller may want to merge even small > + * number of blocks to an existing extent > + */ > + best_found = 1; > + > + } else { > + return 0; > + } > + ac->ac_found++; > + goto use_extent; > + } > + ac->ac_found++; > + > + /* The special case - take what you catch first */ > + if (unlikely(ac->ac_flags & EXT4_MB_HINT_FIRST)) { > + best_found = 1; > + goto use_extent; > + } > + > + /* > + * Prefer not allocating blocks in first group in flex bg for data > + * blocks. > + */ > + if (unlikely((ac->ac_criteria < 2) && > + (ac->ac_flags & EXT4_MB_HINT_DATA) && > + (cur->frsp_offset < EXT4_BLOCKS_PER_GROUP(ac->ac_sb)))) > + return 1; > + > + > + /* If this is first found extent, just store it in the context */ > + if (btx->te_len == 0) > + goto use_extent; > + > + if (ext4_mb_frsp_is_better(ac, cur->frsp_len, &best_found)) > + goto use_extent; > + > + ext4_mb_frsp_check_limits(ac); > + return 0; > + > +use_extent: > + btx->te_flex = tree_idx; > + if (goal) { > + btx->te_flex_start = ext4_blkno_to_flex_offset(ac->ac_sb, > + start); > + btx->te_len = max; > + } else { > + btx->te_flex_start = cur->frsp_offset; > + btx->te_len = cur->frsp_len; > + } > + if (best_found) > + ext4_mb_frsp_use_best_found(ac, tree_idx, cur); > + ext4_mb_frsp_check_limits(ac); > + return 1; > +} > + > +/* Search by goal */ > +static noinline_for_stack > +void ext4_mb_frsp_find_by_goal(struct ext4_allocation_context *ac) > +{ > + struct ext4_frsp_node *cur = NULL; > + unsigned int tree_blk; > + ext4_fsblk_t blk; > + struct ext4_buddy e4b; > + int ret; > + > + if (!(ac->ac_flags & EXT4_MB_HINT_TRY_GOAL)) > + return; > + > + /* compute start node offset in tree */ > + blk = ext4_group_first_block_no(ac->ac_sb, ac->ac_g_ex.fe_group) + > + ac->ac_g_ex.fe_start; > + tree_blk = ext4_blkno_to_flex_offset(ac->ac_sb, blk); > + > + ret = ext4_mb_load_allocator(ac->ac_sb, ac->ac_g_ex.fe_group, &e4b, 0); > + if (ret) > + return; > + > + /* try goal block and its freespace_tree first */ > + mutex_lock(&e4b.frsp_tree->frsp_lock); > + cur = ext4_mb_frsp_search_by_off(ac->ac_sb, e4b.frsp_tree, tree_blk); > + if (cur && tree_blk < cur->frsp_offset + cur->frsp_len - 1) > + ext4_mb_frsp_measure_node(ac, e4b.frsp_tree->frsp_index, cur, > + 1 /* searching by goal */); > + > + mutex_unlock(&e4b.frsp_tree->frsp_lock); > + ext4_mb_unload_allocator(&e4b); > +} > + > +static void ext4_mb_frsp_process(struct ext4_allocation_context *ac, > + struct ext4_frsp_tree *tree) > +{ > + struct ext4_frsp_node *cur = NULL; > + struct rb_node *node = NULL; > + int ret; > + > + node = 
rb_first_cached(&tree->frsp_len_root); > + mb_debug(ac->ac_sb, "Processing tree index %d, flags = %x\n", > + tree->frsp_index, tree->frsp_flags); > + /* > + * In order to serve the "No data blocks in first group in a flexbg" > + * requirement, we cannot do a O(Log N) search here. TODO: it's possible > + * to that at CR=2, where that requirement doesn't apply. > + */ > + while (node && ac->ac_status == AC_STATUS_CONTINUE) { > + cur = rb_entry(node, struct ext4_frsp_node, frsp_len_node); > + if (ac->ac_criteria == 0 && cur->frsp_len < ac->ac_g_ex.fe_len) > + return; > + ret = ext4_mb_frsp_measure_node(ac, tree->frsp_index, cur, > + 0 /* searching by len */); > + if (ret == 0) > + return; > + node = rb_next(node); > + } > +} > + > +/* allocator for freespace_tree */ > +static noinline_for_stack int > +ext4_mb_tree_allocator(struct ext4_allocation_context *ac) > +{ > + struct ext4_buddy e4b; > + struct ext4_sb_info *sbi; > + struct super_block *sb; > + struct ext4_frsp_tree *tree = NULL; > + struct ext4_frsp_node *cur = NULL; > + struct ext4_tree_extent *btx = NULL; > + int ret = 0, start_idx = 0, tree_idx, j; > + > + sb = ac->ac_sb; > + btx = &ac->ac_b_tree_ex; > + sbi = EXT4_SB(sb); > + > + start_idx = ext4_flex_group(sbi, ac->ac_g_ex.fe_group); > + mb_debug(sb, "requested size %d\n", ac->ac_g_ex.fe_len); > + /* First try searching from goal blk in start-idx-th freespace tree */ > + ext4_mb_frsp_find_by_goal(ac); > + if (ac->ac_status == AC_STATUS_FOUND) > + goto out; > + > + if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) > + goto out; > + > + ac->ac_criteria = 0; > + ac->ac_groups_scanned = 0; > +repeat: > + > + /* Loop through the rest of trees (flex_bg) */ > + for (j = start_idx; > + (ac->ac_groups_scanned < sbi->s_mb_num_frsp_trees) && > + ac->ac_status == AC_STATUS_CONTINUE; j++) { > + ac->ac_groups_scanned++; > + tree_idx = (j % sbi->s_mb_num_frsp_trees); > + > + ret = ext4_mb_load_allocator(sb, > + tree_idx * ext4_flex_bg_size(sbi), &e4b, 0); > + if (ret) > + goto out; > + mutex_lock(&e4b.frsp_tree->frsp_lock); > + ext4_mb_frsp_process(ac, e4b.frsp_tree); > + mutex_unlock(&e4b.frsp_tree->frsp_lock); > + ext4_mb_unload_allocator(&e4b); > + } > + > + if (ac->ac_status != AC_STATUS_FOUND) { > + if (ac->ac_criteria < 2) { > + ac->ac_criteria++; > + ac->ac_groups_scanned = 0; > + mb_debug(sb, "Falling back to CR=%d", ac->ac_criteria); > + goto repeat; > + } > + if (btx->te_len > 0 && !(ac->ac_flags & EXT4_MB_HINT_FIRST)) { > + e4b.frsp_flags = 0; > + ret = ext4_mb_load_allocator(sb, btx->te_flex * > + ext4_flex_bg_size(sbi), &e4b, 0); > + if (ret) > + goto out; > + tree = e4b.frsp_tree; > + mutex_lock(&tree->frsp_lock); > + cur = ext4_mb_frsp_search_by_off(sb, tree, > + btx->te_flex_start); > + if (cur) { > + ext4_mb_frsp_use_best_found(ac, btx->te_flex, > + cur); > + mutex_unlock(&tree->frsp_lock); > + ext4_mb_unload_allocator(&e4b); > + > + } else { > + /* > + * Someone else took this freespace node before > + * us. Reset the best-found tree extent, and > + * turn on FIRST HINT (greedy). > + */ > + mutex_unlock(&tree->frsp_lock); > + ac->ac_b_tree_ex.te_flex_start = 0; > + ac->ac_b_tree_ex.te_flex = 0; > + ac->ac_b_tree_ex.te_len = 0; > + ac->ac_status = AC_STATUS_CONTINUE; > + ac->ac_flags |= EXT4_MB_HINT_FIRST; > + ac->ac_groups_scanned = 0; > + ext4_mb_unload_allocator(&e4b); > + goto repeat; > + } > + } > + } > + > + ret = btx->te_len ? 
0 : -ENOSPC; > + > +out: > + return ret; > +} > + > +static void ext4_mb_frsp_init_tree(struct ext4_frsp_tree *tree, int index) > +{ > + tree->frsp_offset_root = RB_ROOT_CACHED; > + tree->frsp_len_root = RB_ROOT_CACHED; > + mutex_init(&(tree->frsp_lock)); > + tree->frsp_flags = 0; > + tree->frsp_index = index; > +} > + > +int ext4_mb_init_freespace_trees(struct super_block *sb) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(sb); > + struct flex_groups *fg; > + int i; > + > + sbi->s_mb_num_frsp_trees = > + ext4_num_grps_to_flexbg(sb, ext4_get_groups_count(sb)); > + > + for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) { > + fg = sbi_array_rcu_deref(sbi, s_flex_groups, i); > + fg->frsp_tree = kzalloc(sizeof(struct ext4_frsp_tree), > + GFP_KERNEL); > + if (!fg->frsp_tree) > + return -ENOMEM; > + ext4_mb_frsp_init_tree(fg->frsp_tree, i); > + } > + > + return 0; > +} > + > +int ext4_mb_add_frsp_trees(struct super_block *sb, int ngroups) > +{ > + struct ext4_sb_info *sbi = EXT4_SB(sb); > + int flex_bg_count, old_trees_count = sbi->s_mb_num_frsp_trees; > + int i; > + > + if (!ext4_mb_frsp_on(sb)) > + return 0; > + > + flex_bg_count = ext4_num_grps_to_flexbg(sb, ngroups); > + if (old_trees_count > 0) > + ext4_mb_frsp_free_tree(sb, ext4_get_frsp_tree(sb, > + old_trees_count - 1)); > + > + for (i = old_trees_count; i < flex_bg_count; i++) { > + struct flex_groups *fg; > + > + fg = sbi_array_rcu_deref(sbi, s_flex_groups, i); > + fg->frsp_tree = kzalloc(sizeof(*fg->frsp_tree), GFP_KERNEL); > + if (!fg->frsp_tree) > + return -ENOMEM; > + ext4_mb_frsp_init_tree(fg->frsp_tree, i); > + } > + sbi->s_mb_num_frsp_trees = flex_bg_count; > + > + return 0; > +} > + > /* > * Divide blocks started from @first with length @len into > * smaller chunks with power of 2 blocks. > @@ -1042,6 +1802,9 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb, > e4b->bd_buddy_page = NULL; > e4b->bd_bitmap_page = NULL; > > + if (ext4_mb_frsp_on(sb)) > + return 0; > + > blocks_per_page = PAGE_SIZE / sb->s_blocksize; > /* > * the buddy cache inode stores the block bitmap > @@ -1099,6 +1862,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) > struct page *page; > int ret = 0; > > + WARN_ON(ext4_mb_frsp_on(sb)); > might_sleep(); > mb_debug(sb, "init group %u\n", group); > this_grp = ext4_get_group_info(sb, group); > @@ -1156,6 +1920,11 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) > * Locking note: This routine calls ext4_mb_init_cache(), which takes the > * block group lock of all groups for this page; do not hold the BG lock when > * calling this routine! > + * > + * For freespace trees, do not hold tree mutex while calling this routine. > + * It is okay to hold that mutex only if EXT4_ALLOCATOR_FRSP_NOLOAD flag is > + * set in e4b->frsp_flags. If that flag is set, this function doesn't try > + * to load an unloaded tree. 
> */ > static noinline_for_stack int > ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group, > @@ -1166,7 +1935,7 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group, > int pnum; > int poff; > struct page *page; > - int ret; > + int ret = 0; > struct ext4_group_info *grp; > struct ext4_sb_info *sbi = EXT4_SB(sb); > struct inode *inode = sbi->s_buddy_cache; > @@ -1183,6 +1952,22 @@ ext4_mb_load_allocator_gfp(struct super_block *sb, ext4_group_t group, > e4b->bd_group = group; > e4b->bd_buddy_page = NULL; > e4b->bd_bitmap_page = NULL; > + if (ext4_mb_frsp_on(sb)) { > + e4b->frsp_tree = ext4_get_frsp_tree(sb, > + ext4_flex_group(sbi, group)); > + e4b->frsp_flags = flags; > + if (flags & EXT4_ALLOCATOR_FRSP_NOLOAD) > + return 0; > + > + mutex_lock(&e4b->frsp_tree->frsp_lock); > + if (e4b->frsp_tree->frsp_flags & EXT4_MB_FRSP_FLAG_LOADED) { > + mutex_unlock(&e4b->frsp_tree->frsp_lock); > + return 0; > + } > + ret = ext4_mb_frsp_load(e4b); > + mutex_unlock(&e4b->frsp_tree->frsp_lock); > + return ret; > + } > > if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { > /* > @@ -1305,6 +2090,8 @@ static int ext4_mb_load_allocator(struct super_block *sb, ext4_group_t group, > > static void ext4_mb_unload_allocator(struct ext4_buddy *e4b) > { > + if (ext4_mb_frsp_on(e4b->bd_sb)) > + return; > if (e4b->bd_bitmap_page) > put_page(e4b->bd_bitmap_page); > if (e4b->bd_buddy_page) > @@ -1497,6 +2284,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, > if (first < e4b->bd_info->bb_first_free) > e4b->bd_info->bb_first_free = first; > > + if (ext4_mb_frsp_on(sb)) { > + ext4_grpblk_t first_blk = EXT4_C2B(EXT4_SB(sb), first) + > + ext4_group_first_block_no(sb, e4b->bd_group); > + > + ext4_unlock_group(sb, e4b->bd_group); > + ext4_mb_frsp_free_blocks(sb, e4b->bd_group, first_blk, count); > + ext4_lock_group(sb, e4b->bd_group); > + return; > + } > + > /* access memory sequentially: check left neighbour, > * clear range and then check right neighbour > */ > @@ -1563,6 +2360,9 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block, > assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); > BUG_ON(ex == NULL); > > + if (ext4_mb_frsp_on(e4b->bd_sb)) > + return 0; > + > buddy = mb_find_buddy(e4b, 0, &max); > BUG_ON(buddy == NULL); > BUG_ON(block >= max); > @@ -1627,17 +2427,74 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) > unsigned ret = 0; > int len0 = len; > void *buddy; > + ext4_grpblk_t flex_offset; > > BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); > BUG_ON(e4b->bd_group != ex->fe_group); > + > + e4b->bd_info->bb_free -= len; > + if (e4b->bd_info->bb_first_free == start) > + e4b->bd_info->bb_first_free += len; > + > + if (ext4_mb_frsp_on(e4b->bd_sb)) { > + struct ext4_frsp_node *new; > + > + flex_offset = ext4_blkno_to_flex_offset(e4b->bd_sb, > + ext4_group_first_block_no(e4b->bd_sb, > + e4b->bd_group) + ex->fe_start); > + if (!ex->fe_node) { > + ex->fe_node = ext4_mb_frsp_search_by_off(e4b->bd_sb, > + e4b->frsp_tree, flex_offset); > + if (!ex->fe_node) > + return 0; > + } > + /* > + * Remove the node from the trees before we modify these since > + * we will change the length and / or offset of this node. 
> + */ > + rb_erase_cached(&ex->fe_node->frsp_node, > + &e4b->frsp_tree->frsp_offset_root); > + rb_erase_cached(&ex->fe_node->frsp_len_node, > + &e4b->frsp_tree->frsp_len_root); > + RB_CLEAR_NODE(&ex->fe_node->frsp_node); > + RB_CLEAR_NODE(&ex->fe_node->frsp_len_node); > + if (flex_offset > ex->fe_node->frsp_offset) { > + if (flex_offset + ex->fe_len != > + ex->fe_node->frsp_offset + > + ex->fe_node->frsp_len) { > + /* Need to split the node */ > + new = ext4_mb_frsp_alloc_node(e4b->bd_sb); > + if (!new) > + return -ENOMEM; > + new->frsp_offset = flex_offset + ex->fe_len; > + new->frsp_len = (ex->fe_node->frsp_offset + > + ex->fe_node->frsp_len) - > + new->frsp_offset; > + ext4_mb_frsp_insert_len(e4b->frsp_tree, new); > + ext4_mb_frsp_insert_off(e4b->frsp_tree, new); > + } > + ex->fe_node->frsp_len = flex_offset - > + ex->fe_node->frsp_offset; > + } else if (ex->fe_len < ex->fe_node->frsp_len) { > + /* used only a part: update node */ > + ex->fe_node->frsp_offset += ex->fe_len; > + ex->fe_node->frsp_len -= ex->fe_len; > + } else { > + ext4_mb_frsp_free_node(e4b->bd_sb, ex->fe_node); > + return 0; > + } > + > + ext4_mb_frsp_insert_len(e4b->frsp_tree, ex->fe_node); > + ext4_mb_frsp_insert_off(e4b->frsp_tree, ex->fe_node); > + > + return 0; > + } > + > assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); > mb_check_buddy(e4b); > mb_mark_used_double(e4b, start, len); > > this_cpu_inc(discard_pa_seq); > - e4b->bd_info->bb_free -= len; > - if (e4b->bd_info->bb_first_free == start) > - e4b->bd_info->bb_first_free += len; > > /* let's maintain fragments counter */ > if (start != 0) > @@ -2778,6 +3635,13 @@ static int ext4_mb_init_backend(struct super_block *sb) > rcu_read_lock(); > kvfree(rcu_dereference(sbi->s_group_info)); > rcu_read_unlock(); > + if (ext4_mb_frsp_on(sb)) { > + for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) { > + struct ext4_frsp_tree *tree = ext4_get_frsp_tree(sb, i); > + > + kfree(tree); > + } > + } > return -ENOMEM; > } > > @@ -2874,6 +3738,13 @@ int ext4_mb_init(struct super_block *sb) > i++; > } while (i <= sb->s_blocksize_bits + 1); > > + /* init for freespace trees */ > + if (ext4_mb_frsp_on(sb)) { > + ret = ext4_mb_init_freespace_trees(sb); > + if (ret) > + goto out; > + } > + > spin_lock_init(&sbi->s_md_lock); > spin_lock_init(&sbi->s_bal_lock); > sbi->s_mb_free_pending = 0; > @@ -2940,6 +3811,13 @@ int ext4_mb_init(struct super_block *sb) > sbi->s_mb_offsets = NULL; > kfree(sbi->s_mb_maxs); > sbi->s_mb_maxs = NULL; > + if (ext4_mb_frsp_on(sb)) { > + for (i = 0; i < sbi->s_mb_num_frsp_trees; i++) { > + struct ext4_frsp_tree *tree = ext4_get_frsp_tree(sb, i); > + > + kfree(tree); > + } > + } > return ret; > } > > @@ -3084,8 +3962,10 @@ static void ext4_free_data_in_buddy(struct super_block *sb, > /* No more items in the per group rb tree > * balance refcounts from ext4_mb_free_metadata() > */ > - put_page(e4b.bd_buddy_page); > - put_page(e4b.bd_bitmap_page); > + if (!ext4_mb_frsp_on(sb)) { > + put_page(e4b.bd_buddy_page); > + put_page(e4b.bd_bitmap_page); > + } > } > ext4_unlock_group(sb, entry->efd_group); > kmem_cache_free(ext4_free_data_cachep, entry); > @@ -3163,9 +4043,14 @@ int __init ext4_init_mballoc(void) > SLAB_RECLAIM_ACCOUNT); > if (ext4_free_data_cachep == NULL) > goto out_ac_free; > + ext4_freespace_node_cachep = KMEM_CACHE(ext4_frsp_node, > + SLAB_RECLAIM_ACCOUNT); > + if (ext4_freespace_node_cachep == NULL) > + goto out_frsp_free; > > return 0; > - > +out_frsp_free: > + kmem_cache_destroy(ext4_free_data_cachep); > out_ac_free: > 
kmem_cache_destroy(ext4_ac_cachep); > out_pa_free: > @@ -3184,6 +4069,7 @@ void ext4_exit_mballoc(void) > kmem_cache_destroy(ext4_pspace_cachep); > kmem_cache_destroy(ext4_ac_cachep); > kmem_cache_destroy(ext4_free_data_cachep); > + kmem_cache_destroy(ext4_freespace_node_cachep); > ext4_groupinfo_destroy_slabs(); > } > > @@ -3686,6 +4572,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) > if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) > return false; > > + if (ext4_mb_frsp_on(ac->ac_sb)) > + return 0; > + > /* first, try per-file preallocation */ > rcu_read_lock(); > list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { > @@ -4362,6 +5251,8 @@ static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) > struct ext4_prealloc_space *pa; > > BUG_ON(ext4_pspace_cachep == NULL); > + if (ext4_mb_frsp_on(ac->ac_sb)) > + return 0; > pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); > if (!pa) > return -ENOMEM; > @@ -4374,6 +5265,8 @@ static void ext4_mb_pa_free(struct ext4_allocation_context *ac) > { > struct ext4_prealloc_space *pa = ac->ac_pa; > > + if (ext4_mb_frsp_on(ac->ac_sb)) > + return; > BUG_ON(!pa); > ac->ac_pa = NULL; > WARN_ON(!atomic_dec_and_test(&pa->pa_count)); > @@ -4547,6 +5440,13 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, > ac->ac_o_ex.fe_len = len; > ac->ac_g_ex = ac->ac_o_ex; > ac->ac_flags = ar->flags; > + if (ext4_mb_frsp_on(sb)) > + ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; > + > + /* set up best-found tree node */ > + ac->ac_b_tree_ex.te_flex_start = 0; > + ac->ac_b_tree_ex.te_flex = 0; > + ac->ac_b_tree_ex.te_len = 0; > > /* we have to define context: we'll we work with a file or > * locality group. this is a policy, actually */ > @@ -4867,7 +5767,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, > goto errout; > repeat: > /* allocate space in core */ > - *errp = ext4_mb_regular_allocator(ac); > + if (ext4_mb_frsp_on(sb)) > + *errp = ext4_mb_tree_allocator(ac); > + else > + *errp = ext4_mb_regular_allocator(ac); > + > /* > * pa allocated above is added to grp->bb_prealloc_list only > * when we were able to allocate some block i.e. when > @@ -4880,8 +5784,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, > ext4_discard_allocated_blocks(ac); > goto errout; > } > - if (ac->ac_status == AC_STATUS_FOUND && > - ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) > + /* > + * Freespace trees should never return more than what was asked > + * for. > + */ > + if (!ext4_mb_frsp_on(sb) && > + (ac->ac_status == AC_STATUS_FOUND && > + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)) > ext4_mb_pa_free(ac); > } > if (likely(ac->ac_status == AC_STATUS_FOUND)) { > @@ -4972,13 +5881,12 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, > struct rb_node *parent = NULL, *new_node; > > BUG_ON(!ext4_handle_valid(handle)); > - BUG_ON(e4b->bd_bitmap_page == NULL); > - BUG_ON(e4b->bd_buddy_page == NULL); > + BUG_ON(!ext4_mb_frsp_on(sb) && e4b->bd_bitmap_page == NULL); > > new_node = &new_entry->efd_node; > cluster = new_entry->efd_start_cluster; > > - if (!*n) { > + if (!*n && !ext4_mb_frsp_on(sb)) { > /* first free block exent. 
We need to > protect buddy cache from being freed, > * otherwise we'll refresh it from > @@ -5457,6 +6365,7 @@ __acquires(bitlock) > ex.fe_start = start; > ex.fe_group = group; > ex.fe_len = count; > + ex.fe_node = NULL; > > /* > * Mark blocks used, so no one can reuse them while > @@ -5496,6 +6405,7 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, > void *bitmap; > ext4_grpblk_t next, count = 0, free_count = 0; > struct ext4_buddy e4b; > + struct buffer_head *bh = NULL; > int ret = 0; > > trace_ext4_trim_all_free(sb, group, start, max); > @@ -5506,7 +6416,17 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, > ret, group); > return ret; > } > - bitmap = e4b.bd_bitmap; > + > + if (ext4_mb_frsp_on(sb)) { > + bh = ext4_read_block_bitmap(sb, group); > + if (IS_ERR(bh)) { > + ret = PTR_ERR(bh); > + goto out; > + } > + bitmap = bh->b_data; > + } else { > + bitmap = e4b.bd_bitmap; > + } > > ext4_lock_group(sb, group); > if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) && > @@ -5553,6 +6473,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group, > EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info); > } > out: > + if (!IS_ERR_OR_NULL(bh)) > + brelse(bh); > ext4_unlock_group(sb, group); > ext4_mb_unload_allocator(&e4b); > > @@ -5613,7 +6535,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) > for (group = first_group; group <= last_group; group++) { > grp = ext4_get_group_info(sb, group); > /* We only do this if the grp has never been initialized */ > - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { > + if (unlikely(!ext4_mb_frsp_on(sb) && EXT4_MB_GRP_NEED_INIT(grp))) { > ret = ext4_mb_init_group(sb, group, GFP_NOFS); > if (ret) > break; > @@ -5667,16 +6589,25 @@ ext4_mballoc_query_range( > ext4_grpblk_t next; > struct ext4_buddy e4b; > int error; > + struct buffer_head *bh = NULL; > > - error = ext4_mb_load_allocator(sb, group, &e4b, 0); > - if (error) > - return error; > - bitmap = e4b.bd_bitmap; > - > + if (ext4_mb_frsp_on(sb)) { > + bh = ext4_read_block_bitmap(sb, group); > + if (IS_ERR(bh)) { > + error = PTR_ERR(bh); > + goto out_unload; > + } > + bitmap = bh->b_data; > + } else { > + error = ext4_mb_load_allocator(sb, group, &e4b, 0); > + if (error) > + return error; > + bitmap = e4b.bd_bitmap; > + } > ext4_lock_group(sb, group); > - > - start = (e4b.bd_info->bb_first_free > start) ? > - e4b.bd_info->bb_first_free : start; > + if (!ext4_mb_frsp_on(sb)) > + start = (e4b.bd_info->bb_first_free > start) ? 
> + e4b.bd_info->bb_first_free : start; > if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) > end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; > > @@ -5685,19 +6616,20 @@ ext4_mballoc_query_range( > if (start > end) > break; > next = mb_find_next_bit(bitmap, end + 1, start); > - > ext4_unlock_group(sb, group); > error = formatter(sb, group, start, next - start, priv); > if (error) > goto out_unload; > ext4_lock_group(sb, group); > - > start = next + 1; > } > > ext4_unlock_group(sb, group); > out_unload: > - ext4_mb_unload_allocator(&e4b); > + if (!IS_ERR_OR_NULL(bh)) > + brelse(bh); > + if (!ext4_mb_frsp_on(sb)) > + ext4_mb_unload_allocator(&e4b); > > return error; > } > diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h > index 6b4d17c2935d..32b9ee452de7 100644 > --- a/fs/ext4/mballoc.h > +++ b/fs/ext4/mballoc.h > @@ -74,6 +74,20 @@ > #define MB_DEFAULT_GROUP_PREALLOC 512 > > > +/* > + * Struct for tree node in freespace_tree > + */ > +struct ext4_frsp_node { > + ext4_grpblk_t frsp_offset; /* Start block offset inside > + * current flexible group > + */ > + ext4_grpblk_t frsp_len; /* > + * Length of the free space in > + * number of clusters > + */ > + struct rb_node frsp_node; > + struct rb_node frsp_len_node; > +}; > struct ext4_free_data { > /* this links the free block information from sb_info */ > struct list_head efd_list; > @@ -121,6 +135,7 @@ struct ext4_free_extent { > ext4_grpblk_t fe_start; /* In cluster units */ > ext4_group_t fe_group; > ext4_grpblk_t fe_len; /* In cluster units */ > + struct ext4_frsp_node *fe_node; > }; > > /* > @@ -141,7 +156,14 @@ struct ext4_locality_group { > spinlock_t lg_prealloc_lock; > }; > > +struct ext4_tree_extent { > + ext4_group_t te_flex; /* flex_bg index (tree index) */ > + ext4_grpblk_t te_flex_start; /* block offset w.r.t flex bg */ > + ext4_grpblk_t te_len; /* length */ > +}; > + > struct ext4_allocation_context { > + __u32 ac_id; > struct inode *ac_inode; > struct super_block *ac_sb; > > @@ -154,8 +176,16 @@ struct ext4_allocation_context { > /* the best found extent */ > struct ext4_free_extent ac_b_ex; > > - /* copy of the best found extent taken before preallocation efforts */ > - struct ext4_free_extent ac_f_ex; > + /* With freespace trees, we don't use preallocation anymore. */ > + union { > + /* > + * copy of the best found extent taken before > + * preallocation efforts > + */ > + struct ext4_free_extent ac_f_ex; > + /* the best found tree extent */ > + struct ext4_tree_extent ac_b_tree_ex; > + }; > > __u16 ac_groups_scanned; > __u16 ac_found; > @@ -177,14 +207,31 @@ struct ext4_allocation_context { > #define AC_STATUS_FOUND 2 > #define AC_STATUS_BREAK 3 > > +/* > + * Freespace tree flags > + */ > +#define EXT4_ALLOCATOR_FRSP_NOLOAD 0x0001 /* > + * Don't load freespace > + * tree, if it's not > + * in memory. 
> + */ > + > struct ext4_buddy { > - struct page *bd_buddy_page; > - void *bd_buddy; > - struct page *bd_bitmap_page; > - void *bd_bitmap; > + union { > + struct { > + struct page *bd_buddy_page; > + void *bd_buddy; > + struct page *bd_bitmap_page; > + void *bd_bitmap; > + __u16 bd_blkbits; > + }; > + struct { > + struct ext4_frsp_tree *frsp_tree; > + __u32 frsp_flags; > + }; > + }; > struct ext4_group_info *bd_info; > struct super_block *bd_sb; > - __u16 bd_blkbits; > ext4_group_t bd_group; > }; > > diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c > index a50b51270ea9..6a0e1fc18e95 100644 > --- a/fs/ext4/resize.c > +++ b/fs/ext4/resize.c > @@ -1679,6 +1679,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) > if (err) > goto out; > > + err = ext4_mb_add_frsp_trees(sb, input->group + 1); > + if (err) > + goto out; > + > flex_gd.count = 1; > flex_gd.groups = input; > flex_gd.bg_flags = &bg_flags; > @@ -2051,6 +2055,10 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count) > if (err) > goto out; > > + err = ext4_mb_add_frsp_trees(sb, n_group + 1); > + if (err) > + goto out; > + > flex_gd = alloc_flex_gd(flexbg_size); > if (flex_gd == NULL) { > err = -ENOMEM; > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 98aa12602b68..2f4b7061365f 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1521,7 +1521,7 @@ enum { > Opt_dioread_nolock, Opt_dioread_lock, > Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, > Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, > - Opt_prefetch_block_bitmaps, > + Opt_prefetch_block_bitmaps, Opt_freespace_tree, > }; > > static const match_table_t tokens = { > @@ -1608,6 +1608,7 @@ static const match_table_t tokens = { > {Opt_init_itable, "init_itable=%u"}, > {Opt_init_itable, "init_itable"}, > {Opt_noinit_itable, "noinit_itable"}, > + {Opt_freespace_tree, "freespace_tree"}, > {Opt_max_dir_size_kb, "max_dir_size_kb=%u"}, > {Opt_test_dummy_encryption, "test_dummy_encryption=%s"}, > {Opt_test_dummy_encryption, "test_dummy_encryption"}, > @@ -1834,6 +1835,8 @@ static const struct mount_opts { > {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, > {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, > MOPT_SET}, > + {Opt_freespace_tree, EXT4_MOUNT2_FREESPACE_TREE, > + MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, > {Opt_err, 0, 0} > }; > > @@ -4740,12 +4743,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > goto failed_mount4a; > } > Flex bg is required. This requirement needs to be placed in documentation. Also mkfs should prevent from creating partition with this new feature, but without flex_bg. Same for tunefs. Harshad, are you going to share e2fsprogs with the new option support soon? It would be great to have it for debugging purpose. 
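Until mkfs and tunefs enforce this, it may also be worth rejecting the mount option explicitly when flex_bg is absent. As the patch stands, ext4_mb_frsp_on() simply returns false when s_log_groups_per_flex is zero, so "-o freespace_tree" is silently ignored and the buddy allocator is used. A rough, untested sketch of what I mean (exact placement in ext4_fill_super() and the error label are of course up to you):

	/* Illustrative only: refuse freespace_tree without flex_bg. */
	if (test_opt2(sb, FREESPACE_TREE) && !ext4_has_feature_flex_bg(sb)) {
		ext4_msg(sb, KERN_ERR,
			 "can't mount with freespace_tree, flex_bg feature is missing");
		goto failed_mount;
	}

An explicit error would keep the kernel consistent with the future e2fsprogs checks and avoid users believing the new allocator is active when it is not.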
> + if (ext4_has_feature_flex_bg(sb)) > + if (!ext4_fill_flex_info(sb)) { > + ext4_msg(sb, KERN_ERR, > + "unable to initialize flex_bg meta info!"); > + goto failed_mount5; > + } > + > ext4_ext_init(sb); > err = ext4_mb_init(sb); > if (err) { > ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", > err); > - goto failed_mount5; > + goto failed_mount6; > } > > block = ext4_count_free_clusters(sb); > @@ -4775,14 +4785,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > goto failed_mount6; > } > > - if (ext4_has_feature_flex_bg(sb)) > - if (!ext4_fill_flex_info(sb)) { > - ext4_msg(sb, KERN_ERR, > - "unable to initialize " > - "flex_bg meta info!"); > - goto failed_mount6; > - } > - > err = ext4_register_li_request(sb, first_not_zeroed); > if (err) > goto failed_mount6; > @@ -4856,7 +4858,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > ext4_unregister_li_request(sb); > failed_mount6: > ext4_mb_release(sb); > + percpu_counter_destroy(&sbi->s_freeclusters_counter); > + percpu_counter_destroy(&sbi->s_freeinodes_counter); > + percpu_counter_destroy(&sbi->s_dirs_counter); > + percpu_counter_destroy(&sbi->s_dirtyclusters_counter); > + percpu_free_rwsem(&sbi->s_writepages_rwsem); > rcu_read_lock(); > + > +failed_mount5: > flex_groups = rcu_dereference(sbi->s_flex_groups); > if (flex_groups) { > for (i = 0; i < sbi->s_flex_groups_allocated; i++) > @@ -4864,12 +4873,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > kvfree(flex_groups); > } > rcu_read_unlock(); > - percpu_counter_destroy(&sbi->s_freeclusters_counter); > - percpu_counter_destroy(&sbi->s_freeinodes_counter); > - percpu_counter_destroy(&sbi->s_dirs_counter); > - percpu_counter_destroy(&sbi->s_dirtyclusters_counter); > - percpu_free_rwsem(&sbi->s_writepages_rwsem); > -failed_mount5: > ext4_ext_release(sb); > ext4_release_system_zone(sb); > failed_mount4a: > -- > 2.28.0.220.ged08abb693-goog Is it possible to split this patch to “freespace trees primitives” and “free space allocator logics” to simplify patchers inspection and landing? Thanks. Best regards, Artem Blagodarenko.