Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@xxxxxxxxxx]
> Sent: Monday, September 15, 2014 6:14 AM
> To: linux-kernel@xxxxxxxxxxxxxxx; linux-fsdevel@xxxxxxxxxxxxxxx;
> linux-f2fs-devel@xxxxxxxxxxxxxxxxxxxxx
> Cc: Jaegeuk Kim
> Subject: [f2fs-dev] [PATCH 07/10] f2fs: use meta_inode cache to improve roll-forward speed
>
> Previously, all the dnode pages had to be read during roll-forward recovery.
> Even worse, the whole chain was traversed twice.
> This patch removes those redundant and costly read operations by using the
> page cache of meta_inode and the readahead function as well.
>
> Signed-off-by: Jaegeuk Kim <jaegeuk@xxxxxxxxxx>
> ---
>  fs/f2fs/checkpoint.c | 11 ++++++++--
>  fs/f2fs/f2fs.h       |  5 +++--
>  fs/f2fs/recovery.c   | 59 +++++++++++++++++++++++++---------------------------
>  fs/f2fs/segment.h    |  5 +++--
>  4 files changed, 43 insertions(+), 37 deletions(-)
>
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 7262d99..d1ed889 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -82,6 +82,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
>  	case META_SSA:
>  	case META_CP:
>  		return 0;
> +	case META_POR:
> +		return SM_I(sbi)->main_blkaddr + sbi->user_block_count;

Here we will skip the virtual over-provision segments, so it is better to use
TOTAL_BLKS(sbi).

>  	default:
>  		BUG();
>  	}
> @@ -90,11 +92,11 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type)
>  /*
>   * Readahead CP/NAT/SIT/SSA pages
>   */
> -int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
> +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type)
>  {
>  	block_t prev_blk_addr = 0;
>  	struct page *page;
> -	int blkno = start;
> +	block_t blkno = start;
>  	int max_blks = get_max_meta_blks(sbi, type);
>
>  	struct f2fs_io_info fio = {
> @@ -128,6 +130,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type)
>  		/* get ssa/cp block addr */
>  		blk_addr = blkno;
>  		break;
> +	case META_POR:
> +		if (unlikely(blkno >= max_blks))
> +			goto out;
> +		blk_addr = blkno;
> +		break;

The real modification in the patch that was merged into the dev branch of f2fs
is the following:

-		/* get ssa/cp block addr */
+	case META_POR:
+		if (blkno >= max_blks || blkno < min_blks)
+			goto out;

IMHO, it's better to verify the boundary separately for META_{SSA,CP,POR} with
unlikely. What do you think?
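To make that concrete, here is a minimal sketch of the switch I have in mind.
It is only an illustration of the idea, not the merged code; the exact
min_blks/max_blks bounds per type are my assumption:

	case META_SSA:
	case META_CP:
		/* SSA/CP keep their own upper-bound check */
		if (unlikely(blkno >= max_blks))
			goto out;
		blk_addr = blkno;
		break;
	case META_POR:
		/* POR readahead must stay inside the main area */
		if (unlikely(blkno < min_blks || blkno >= max_blks))
			goto out;
		blk_addr = blkno;
		break;
	default:
		BUG();

The point of keeping the cases separate is that each type gets a precise bound
while the out-of-range test stays annotated as the cold path.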
>  	default:
>  		BUG();
>  	}
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index 4f84d2a..48d7d46 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -103,7 +103,8 @@ enum {
>  	META_CP,
>  	META_NAT,
>  	META_SIT,
> -	META_SSA
> +	META_SSA,
> +	META_POR,
>  };
>
>  /* for the list of ino */
> @@ -1291,7 +1292,7 @@ void destroy_segment_manager_caches(void);
>   */
>  struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t);
>  struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t);
> -int ra_meta_pages(struct f2fs_sb_info *, int, int, int);
> +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int);
>  long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
>  void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
>  void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
> diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
> index 3736728..6f7fbfa 100644
> --- a/fs/f2fs/recovery.c
> +++ b/fs/f2fs/recovery.c
> @@ -173,7 +173,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
>  {
>  	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
>  	struct curseg_info *curseg;
> -	struct page *page;
> +	struct page *page = NULL;
>  	block_t blkaddr;
>  	int err = 0;
>
> @@ -181,20 +181,19 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
>  	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
>  	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
>
> -	/* read node page */
> -	page = alloc_page(GFP_F2FS_ZERO);
> -	if (!page)
> -		return -ENOMEM;
> -	lock_page(page);
> -
>  	while (1) {
>  		struct fsync_inode_entry *entry;
>
> -		err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
> -		if (err)
> -			return err;
> +		if (blkaddr < SM_I(sbi)->main_blkaddr ||
> +			blkaddr > TOTAL_BLKS(sbi))

blkaddr >= TOTAL_BLKS(sbi) ?  (See the short boundary sketch at the end of
this mail.)

> +			break;
>
> -		lock_page(page);
> +		page = get_meta_page(sbi, blkaddr);
> +		if (IS_ERR(page))
> +			return PTR_ERR(page);
> +
> +		ra_meta_pages(sbi, next_blkaddr_of_node(page),
> +			MAX_BIO_BLOCKS(max_hw_blocks(sbi)), META_POR);

In the first round we readahead pages [0, n]; then in the second round we
readahead pages [1, n+1], but since pages [1, n] are already uptodate, we will
actually readahead only one page, and the same goes for the following rounds.
That is inefficient.
So how about modifying find_fsync_dnodes/recover_data like this:

	block_t blkaddr, ra_start_blk;
	int err = 0;

	/* get node pages in the current segment */
	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
	ra_start_blk = blkaddr;

	while (1) {
		struct fsync_inode_entry *entry;

		if (blkaddr < SM_I(sbi)->main_blkaddr ||
			blkaddr > TOTAL_BLKS(sbi))
			return 0;

		if (ra_start_blk == blkaddr) {
			ra_meta_pages(sbi, ra_start_blk,
				MAX_BIO_BLOCKS(sbi), META_POR);
			ra_start_blk += MAX_BIO_BLOCKS(sbi);
		}

		page = get_meta_page(sbi, blkaddr);

Thanks,
Yu

>
>  		if (cp_ver != cpver_of_node(page))
>  			break;
> @@ -243,11 +242,9 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
> next:
>  		/* check next segment */
>  		blkaddr = next_blkaddr_of_node(page);
> +		f2fs_put_page(page, 1);
>  	}
> -
> -	unlock_page(page);
> -	__free_pages(page, 0);
> -
> +	f2fs_put_page(page, 1);
>  	return err;
>  }
>
> @@ -427,7 +424,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
>  {
>  	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
>  	struct curseg_info *curseg;
> -	struct page *page;
> +	struct page *page = NULL;
>  	int err = 0;
>  	block_t blkaddr;
>
> @@ -435,21 +432,19 @@ static int recover_data(struct f2fs_sb_info *sbi,
>  	curseg = CURSEG_I(sbi, type);
>  	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
>
> -	/* read node page */
> -	page = alloc_page(GFP_F2FS_ZERO);
> -	if (!page)
> -		return -ENOMEM;
> -
> -	lock_page(page);
> -
>  	while (1) {
>  		struct fsync_inode_entry *entry;
>
> -		err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC);
> -		if (err)
> -			return err;
> +		if (blkaddr < SM_I(sbi)->main_blkaddr ||
> +			blkaddr > TOTAL_BLKS(sbi))
> +			break;
>
> -		lock_page(page);
> +		page = get_meta_page(sbi, blkaddr);
> +		if (IS_ERR(page))
> +			return PTR_ERR(page);
> +
> +		ra_meta_pages(sbi, next_blkaddr_of_node(page),
> +			MAX_BIO_BLOCKS(max_hw_blocks(sbi)), META_POR);
>
>  		if (cp_ver != cpver_of_node(page))
>  			break;
> @@ -477,11 +472,9 @@ static int recover_data(struct f2fs_sb_info *sbi,
> next:
>  		/* check next segment */
>  		blkaddr = next_blkaddr_of_node(page);
> +		f2fs_put_page(page, 1);
>  	}
> -
> -	unlock_page(page);
> -	__free_pages(page, 0);
> -
> +	f2fs_put_page(page, 1);
>  	if (!err)
>  		allocate_new_segments(sbi);
>  	return err;
> @@ -527,6 +520,10 @@ out:
>  	destroy_fsync_dnodes(&inode_list);
>  	kmem_cache_destroy(fsync_entry_slab);
>
> +	/* truncate meta pages to be used by the recovery */
> +	truncate_inode_pages_range(META_MAPPING(sbi),
> +		SM_I(sbi)->main_blkaddr << PAGE_CACHE_SHIFT, -1);
> +
>  	if (err) {
>  		truncate_inode_pages_final(NODE_MAPPING(sbi));
>  		truncate_inode_pages_final(META_MAPPING(sbi));
> diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
> index 013aed2..c6f37f2 100644
> --- a/fs/f2fs/segment.h
> +++ b/fs/f2fs/segment.h
> @@ -87,6 +87,7 @@
>  	(BITS_TO_LONGS(nr) * sizeof(unsigned long))
>  #define TOTAL_SEGS(sbi)	(SM_I(sbi)->main_segments)
>  #define TOTAL_SECS(sbi)	(sbi->total_sections)
> +#define TOTAL_BLKS(sbi)	(SM_I(sbi)->segment_count << sbi->log_blocks_per_seg)
>
>  #define SECTOR_FROM_BLOCK(sbi, blk_addr)				\
>  	(((sector_t)blk_addr) << (sbi)->log_sectors_per_block)
> @@ -553,7 +554,7 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
>  static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
>  {
>  	struct f2fs_sm_info *sm_info = SM_I(sbi);
> -	block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
> +	block_t total_blks = TOTAL_BLKS(sbi);
>  	block_t start_addr = sm_info->seg0_blkaddr;
>  	block_t end_addr = start_addr + total_blks - 1;
>  	BUG_ON(blk_addr < start_addr);
> @@ -606,7 +607,7 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
>  static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
>  {
>  	struct f2fs_sm_info *sm_info = SM_I(sbi);
> -	block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg;
> +	block_t total_blks = TOTAL_BLKS(sbi);
>  	block_t start_addr = sm_info->seg0_blkaddr;
>  	block_t end_addr = start_addr + total_blks - 1;
>
> --
> 1.8.5.2 (Apple Git-48)
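Returning to the "blkaddr >= TOTAL_BLKS(sbi)" question above, here is a minimal
sketch of the boundary semantics as I read them from verify_block_addr(); the
variable names mirror that helper and the comparison is only illustrative, not
the merged code:

	/*
	 * verify_block_addr() treats [start_addr, end_addr] as the valid
	 * range, so start_addr + total_blks is the first invalid address.
	 * Against such an exclusive limit, ">=" rejects the first bad
	 * block, while ">" lets exactly one out-of-range block through.
	 */
	block_t total_blks = TOTAL_BLKS(sbi);
	block_t start_addr = sm_info->seg0_blkaddr;
	block_t end_addr = start_addr + total_blks - 1;	/* last valid block */

	if (blkaddr < start_addr || blkaddr > end_addr)
		break;	/* same as: blkaddr >= start_addr + total_blks */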