Re: [PATCH] ext4: expand commit callback

Bobi Jam <bobijam@xxxxxxxxxxxxx> · Fri, 04 Nov 2011 20:17:49 +0800

ping. Any comment on this patch proposal? Thanks.

On 2011-10-26 18:49, Bobi Jam wrote:
> The per-commit callback is now used by mballoc code to manage free
> space bitmaps after deleted blocks have been released. This patch
> expand it to contain multiple different callbacks.
> 
> Signed-off-by: Bobi Jam <bobijam@xxxxxxxxxxxxx>
> Signed-off-by: Andreas Dilger <adilger@xxxxxxxxxxxxx>
> ---
>  fs/ext4/ext4_jbd2.h |   72 ++++++++++++++++++++++++
>  fs/ext4/mballoc.c   |  150 ++++++++++++++++++++++++---------------------------
>  fs/ext4/mballoc.h   |   18 ++++---
>  fs/ext4/super.c     |   18 ++++++
>  4 files changed, 170 insertions(+), 88 deletions(-)
> 
> diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
> index 5802fa1..c65a774 100644
> --- a/fs/ext4/ext4_jbd2.h
> +++ b/fs/ext4/ext4_jbd2.h
> @@ -104,6 +104,78 @@
>  #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
>  #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
>  
> +/**
> + *   struct ext4_journal_cb_entry - Base structure for callback information.
> + *
> + *   This struct is a 'seed' structure for a using with your own callback
> + *   structs. If you are using callbacks you must allocate one of these
> + *   or another struct of your own definition which has this struct
> + *   as it's first element and pass it to ext4_journal_callback_add().
> + */
> +struct ext4_journal_cb_entry {
> +	/* list information for other callbacks attached to the same handle */
> +	struct list_head jce_list;
> +
> +	/*  Function to call with this callback structure */
> +	void (*jce_func)(struct super_block *sb,
> +			 struct ext4_journal_cb_entry *jce, int error);
> +
> +	/* user data goes here */
> +};
> +
> +/**
> + * ext4_journal_callback_add: add a function to call after transaction commit
> + * @handle: active journal transaction handle to register callback on
> + * @func: callback function to call after the transaction has committed:
> + *        @sb: superblock of current filesystem for transaction
> + *        @jce: returned journal callback data
> + *        @rc: journal state at commit (0 = transaction committed properly)
> + * @jce: journal callback data (internal and function private data struct)
> + *
> + * The registered function will be called in the context of the journal thread
> + * after the transaction for which the handle was created has completed.
> + *
> + * No locks are held when the callback function is called, so it is safe to
> + * call blocking functions from within the callback, but the callback should
> + * not block or run for too long, or the filesystem will be blocked waiting for
> + * the next transaction to commit. No journaling functions can be used, or
> + * there is a risk of deadlock.
> + *
> + * There is no guaranteed calling order of multiple registered callbacks on
> + * the same transaction.
> + */
> +static inline void ext4_journal_callback_add(handle_t *handle,
> +			void (*func)(struct super_block *sb,
> +				     struct ext4_journal_cb_entry *jce,
> +				     int rc),
> +			struct ext4_journal_cb_entry *jce)
> +{
> +	struct ext4_sb_info *sbi =
> +			EXT4_SB(handle->h_transaction->t_journal->j_private);
> +
> +	/* Add the jce to transaction's private list */
> +	jce->jce_func = func;
> +	spin_lock(&sbi->s_md_lock);
> +	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
> +	spin_unlock(&sbi->s_md_lock);
> +}
> +
> +/**
> + * ext4_journal_callback_del: delete a registered callback
> + * @handle: active journal transaction handle on which callback was registered
> + * @jce: registered journal callback entry to unregister
> + */
> +static inline void ext4_journal_callback_del(handle_t *handle,
> +					     struct ext4_journal_cb_entry *jce)
> +{
> +	struct ext4_sb_info *sbi =
> +			EXT4_SB(handle->h_transaction->t_journal->j_private);
> +
> +	spin_lock(&sbi->s_md_lock);
> +	list_del_init(&jce->jce_list);
> +	spin_unlock(&sbi->s_md_lock);
> +}
> +
>  int
>  ext4_mark_iloc_dirty(handle_t *handle,
>  		     struct inode *inode,
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 17a5a57..b90da57 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -21,6 +21,7 @@
>   * mballoc.c contains the multiblocks allocation routines
>   */
>  
> +#include "ext4_jbd2.h"
>  #include "mballoc.h"
>  #include <linux/debugfs.h>
>  #include <linux/slab.h>
> @@ -338,7 +339,7 @@
>   */
>  static struct kmem_cache *ext4_pspace_cachep;
>  static struct kmem_cache *ext4_ac_cachep;
> -static struct kmem_cache *ext4_free_ext_cachep;
> +static struct kmem_cache *ext4_free_data_cachep;
>  
>  /* We create slab caches for groupinfo data structures based on the
>   * superblock block size.  There will be one per mounted filesystem for
> @@ -356,7 +357,8 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
>  					ext4_group_t group);
>  static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
>  						ext4_group_t group);
> -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
> +static void ext4_free_data_callback(struct super_block *sb,
> +					struct ext4_journal_cb_entry *jce, int error);
>  
>  static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
>  {
> @@ -2511,8 +2513,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
>  		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
>  				 &ext4_mb_seq_groups_fops, sb);
>  
> -	if (sbi->s_journal)
> -		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
>  out:
>  	if (ret) {
>  		kfree(sbi->s_mb_offsets);
> @@ -2616,58 +2616,54 @@ static inline int ext4_issue_discard(struct super_block *sb,
>   * This function is called by the jbd2 layer once the commit has finished,
>   * so we know we can free the blocks that were released with that commit.
>   */
> -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
> +static void ext4_free_data_callback(struct super_block *sb,
> +				    struct ext4_journal_cb_entry *jce,
> +				    int rc)
>  {
> -	struct super_block *sb = journal->j_private;
> +	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
>  	struct ext4_buddy e4b;
>  	struct ext4_group_info *db;
>  	int err, count = 0, count2 = 0;
> -	struct ext4_free_data *entry;
> -	struct list_head *l, *ltmp;
>  
> -	list_for_each_safe(l, ltmp, &txn->t_private_list) {
> -		entry = list_entry(l, struct ext4_free_data, list);
> +	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
> +		 entry->efd_count, entry->efd_group, entry);
>  
> -		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
> -			 entry->count, entry->group, entry);
> +	if (test_opt(sb, DISCARD))
> +		ext4_issue_discard(sb, entry->efd_group,
> +				   entry->efd_start_blk, entry->efd_count);
>  
> -		if (test_opt(sb, DISCARD))
> -			ext4_issue_discard(sb, entry->group,
> -					   entry->start_blk, entry->count);
> +	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
> +	/* we expect to find existing buddy because it's pinned */
> +	BUG_ON(err != 0);
>  
> -		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
> -		/* we expect to find existing buddy because it's pinned */
> -		BUG_ON(err != 0);
> +	db = e4b.bd_info;
> +	/* there are blocks to put in buddy to make them really free */
> +	count += entry->efd_count;
> +	count2++;
> +	ext4_lock_group(sb, entry->efd_group);
> +	/* Take it out of per group rb tree */
> +	rb_erase(&entry->efd_node, &(db->bb_free_root));
> +	mb_free_blocks(NULL, &e4b, entry->efd_start_blk, entry->efd_count);
>  
> -		db = e4b.bd_info;
> -		/* there are blocks to put in buddy to make them really free */
> -		count += entry->count;
> -		count2++;
> -		ext4_lock_group(sb, entry->group);
> -		/* Take it out of per group rb tree */
> -		rb_erase(&entry->node, &(db->bb_free_root));
> -		mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count);
> +	/*
> +	 * Clear the trimmed flag for the group so that the next
> +	 * ext4_trim_fs can trim it.
> +	 * If the volume is mounted with -o discard, online discard
> +	 * is supported and the free blocks will be trimmed online.
> +	 */
> +	if (!test_opt(sb, DISCARD))
> +		EXT4_MB_GRP_CLEAR_TRIMMED(db);
>  
> -		/*
> -		 * Clear the trimmed flag for the group so that the next
> -		 * ext4_trim_fs can trim it.
> -		 * If the volume is mounted with -o discard, online discard
> -		 * is supported and the free blocks will be trimmed online.
> +	if (!db->bb_free_root.rb_node) {
> +		/* No more items in the per group rb tree
> +		 * balance refcounts from ext4_mb_free_metadata()
>  		 */
> -		if (!test_opt(sb, DISCARD))
> -			EXT4_MB_GRP_CLEAR_TRIMMED(db);
> -
> -		if (!db->bb_free_root.rb_node) {
> -			/* No more items in the per group rb tree
> -			 * balance refcounts from ext4_mb_free_metadata()
> -			 */
> -			page_cache_release(e4b.bd_buddy_page);
> -			page_cache_release(e4b.bd_bitmap_page);
> -		}
> -		ext4_unlock_group(sb, entry->group);
> -		kmem_cache_free(ext4_free_ext_cachep, entry);
> -		ext4_mb_unload_buddy(&e4b);
> +		page_cache_release(e4b.bd_buddy_page);
> +		page_cache_release(e4b.bd_bitmap_page);
>  	}
> +	ext4_unlock_group(sb, entry->efd_group);
> +	kmem_cache_free(ext4_free_data_cachep, entry);
> +	ext4_mb_unload_buddy(&e4b);
>  
>  	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
>  }
> @@ -2720,9 +2716,9 @@ int __init ext4_init_mballoc(void)
>  		return -ENOMEM;
>  	}
>  
> -	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
> -					  SLAB_RECLAIM_ACCOUNT);
> -	if (ext4_free_ext_cachep == NULL) {
> +	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
> +					   SLAB_RECLAIM_ACCOUNT);
> +	if (ext4_free_data_cachep == NULL) {
>  		kmem_cache_destroy(ext4_pspace_cachep);
>  		kmem_cache_destroy(ext4_ac_cachep);
>  		return -ENOMEM;
> @@ -2740,7 +2736,7 @@ void ext4_exit_mballoc(void)
>  	rcu_barrier();
>  	kmem_cache_destroy(ext4_pspace_cachep);
>  	kmem_cache_destroy(ext4_ac_cachep);
> -	kmem_cache_destroy(ext4_free_ext_cachep);
> +	kmem_cache_destroy(ext4_free_data_cachep);
>  	ext4_groupinfo_destroy_slabs();
>  	ext4_remove_debugfs_entry();
>  }
> @@ -3290,8 +3286,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
>  	n = rb_first(&(grp->bb_free_root));
>  
>  	while (n) {
> -		entry = rb_entry(n, struct ext4_free_data, node);
> -		ext4_set_bits(bitmap, entry->start_blk, entry->count);
> +		entry = rb_entry(n, struct ext4_free_data, efd_node);
> +		ext4_set_bits(bitmap, entry->efd_start_blk, entry->efd_count);
>  		n = rb_next(n);
>  	}
>  	return;
> @@ -4386,9 +4382,9 @@ out:
>  static int can_merge(struct ext4_free_data *entry1,
>  			struct ext4_free_data *entry2)
>  {
> -	if ((entry1->t_tid == entry2->t_tid) &&
> -	    (entry1->group == entry2->group) &&
> -	    ((entry1->start_blk + entry1->count) == entry2->start_blk))
> +	if ((entry1->efd_tid == entry2->efd_tid) &&
> +	    (entry1->efd_group == entry2->efd_group) &&
> +	    ((entry1->efd_start_blk + entry1->efd_count) == entry2->efd_start_blk))
>  		return 1;
>  	return 0;
>  }
> @@ -4402,7 +4398,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	struct ext4_free_data *entry;
>  	struct ext4_group_info *db = e4b->bd_info;
>  	struct super_block *sb = e4b->bd_sb;
> -	struct ext4_sb_info *sbi = EXT4_SB(sb);
>  	struct rb_node **n = &db->bb_free_root.rb_node, *node;
>  	struct rb_node *parent = NULL, *new_node;
>  
> @@ -4410,8 +4405,8 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	BUG_ON(e4b->bd_bitmap_page == NULL);
>  	BUG_ON(e4b->bd_buddy_page == NULL);
>  
> -	new_node = &new_entry->node;
> -	block = new_entry->start_blk;
> +	new_node = &new_entry->efd_node;
> +	block = new_entry->efd_start_blk;
>  
>  	if (!*n) {
>  		/* first free block exent. We need to
> @@ -4424,10 +4419,10 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	}
>  	while (*n) {
>  		parent = *n;
> -		entry = rb_entry(parent, struct ext4_free_data, node);
> -		if (block < entry->start_blk)
> +		entry = rb_entry(parent, struct ext4_free_data, efd_node);
> +		if (block < entry->efd_start_blk)
>  			n = &(*n)->rb_left;
> -		else if (block >= (entry->start_blk + entry->count))
> +		else if (block >= (entry->efd_start_blk + entry->efd_count))
>  			n = &(*n)->rb_right;
>  		else {
>  			ext4_grp_locked_error(sb, group, 0,
> @@ -4443,34 +4438,29 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
>  	/* Now try to see the extent can be merged to left and right */
>  	node = rb_prev(new_node);
>  	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, node);
> +		entry = rb_entry(node, struct ext4_free_data, efd_node);
>  		if (can_merge(entry, new_entry)) {
> -			new_entry->start_blk = entry->start_blk;
> -			new_entry->count += entry->count;
> +			new_entry->efd_start_blk = entry->efd_start_blk;
> +			new_entry->efd_count += entry->efd_count;
>  			rb_erase(node, &(db->bb_free_root));
> -			spin_lock(&sbi->s_md_lock);
> -			list_del(&entry->list);
> -			spin_unlock(&sbi->s_md_lock);
> -			kmem_cache_free(ext4_free_ext_cachep, entry);
> +			ext4_journal_callback_del(handle, &entry->efd_jce);
> +			kmem_cache_free(ext4_free_data_cachep, entry);
>  		}
>  	}
>  
>  	node = rb_next(new_node);
>  	if (node) {
> -		entry = rb_entry(node, struct ext4_free_data, node);
> +		entry = rb_entry(node, struct ext4_free_data, efd_node);
>  		if (can_merge(new_entry, entry)) {
> -			new_entry->count += entry->count;
> +			new_entry->efd_count += entry->efd_count;
>  			rb_erase(node, &(db->bb_free_root));
> -			spin_lock(&sbi->s_md_lock);
> -			list_del(&entry->list);
> -			spin_unlock(&sbi->s_md_lock);
> -			kmem_cache_free(ext4_free_ext_cachep, entry);
> +			ext4_journal_callback_del(handle, &entry->efd_jce);
> +			kmem_cache_free(ext4_free_data_cachep, entry);
>  		}
>  	}
>  	/* Add the extent to transaction's private list */
> -	spin_lock(&sbi->s_md_lock);
> -	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
> -	spin_unlock(&sbi->s_md_lock);
> +	ext4_journal_callback_add(handle, ext4_free_data_callback,
> +				  &new_entry->efd_jce);
>  	return 0;
>  }
>  
> @@ -4613,15 +4603,15 @@ do_more:
>  		 * blocks being freed are metadata. these blocks shouldn't
>  		 * be used until this transaction is committed
>  		 */
> -		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
> +		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
>  		if (!new_entry) {
>  			err = -ENOMEM;
>  			goto error_return;
>  		}
> -		new_entry->start_blk = bit;
> -		new_entry->group  = block_group;
> -		new_entry->count = count;
> -		new_entry->t_tid = handle->h_transaction->t_tid;
> +		new_entry->efd_start_blk = bit;
> +		new_entry->efd_group  = block_group;
> +		new_entry->efd_count = count;
> +		new_entry->efd_tid = handle->h_transaction->t_tid;
>  
>  		ext4_lock_group(sb, block_group);
>  		mb_clear_bits(bitmap_bh->b_data, bit, count);
> diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
> index 9d4a636..1e47c2e 100644
> --- a/fs/ext4/mballoc.h
> +++ b/fs/ext4/mballoc.h
> @@ -96,21 +96,23 @@ extern u8 mb_enable_debug;
>  
>  
>  struct ext4_free_data {
> -	/* this links the free block information from group_info */
> -	struct rb_node node;
> +	/* MUST be the first member */
> +	struct ext4_journal_cb_entry	efd_jce;
> +
> +	/* ext4_free_data private data starts from here */
>  
> -	/* this links the free block information from ext4_sb_info */
> -	struct list_head list;
> +	/* this links the free block information from group_info */
> +	struct rb_node			efd_node;
>  
>  	/* group which free block extent belongs */
> -	ext4_group_t group;
> +	ext4_group_t			efd_group;
>  
>  	/* free block extent */
> -	ext4_grpblk_t start_blk;
> -	ext4_grpblk_t count;
> +	ext4_grpblk_t			efd_start_blk;
> +	ext4_grpblk_t			efd_count;
>  
>  	/* transaction which freed this extent */
> -	tid_t	t_tid;
> +	tid_t				efd_tid;
>  };
>  
>  struct ext4_prealloc_space {
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 44d0c8d..5e2205e 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -414,6 +414,22 @@ static void save_error_info(struct super_block *sb, const char *func,
>  	ext4_commit_super(sb, 1);
>  }
>  
> +static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
> +{
> +	struct super_block		*sb = journal->j_private;
> +	struct ext4_sb_info		*sbi = EXT4_SB(sb);
> +	int				error = is_journal_aborted(journal);
> +	struct ext4_journal_cb_entry	*jce, *tmp;
> +
> +	spin_lock(&sbi->s_md_lock);
> +	list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
> +		list_del_init(&jce->jce_list);
> +		spin_unlock(&sbi->s_md_lock);
> +		jce->jce_func(sb, jce, error);
> +		spin_lock(&sbi->s_md_lock);
> +	}
> +	spin_unlock(&sbi->s_md_lock);
> +}
>  
>  /* Deal with the reporting of failure conditions on a filesystem such as
>   * inconsistencies detected or read IO failures.
> @@ -3605,6 +3621,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
>  	}
>  	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
>  
> +	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
> +
>  	/*
>  	 * The journal may have updated the bg summary counts, so we
>  	 * need to update the global counters.

-- 
Bobi Jam  <bobijam@xxxxxxxxxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html