On Tue, 24 Feb 2015 20:01:41 +0100, Andreas Rohner wrote: > This patch adds a small cache to accumulate the small decrements of > the number of live blocks in a segment usage entry. If for example a > large file is deleted, the segment usage entry has to be updated for > every single block. But for every decrement, a MDT write lock has to > be aquired, which blocks the entire SUFILE and effectively turns > this lock into a global lock for the whole file system. > > The cache tries to ameliorate this situation by adding up the > decrements and increments for a given number of segments and > applying the changes all at once. Because the changes are > accumulated in memory and not immediately written to the SUFILE, the > afore mentioned lock only needs to be aquired, if the cache is full > or at the end of the respective operation. > > To effectively get the pointer to the modification cache from the > high level operations down to the update of the individual blocks in > nilfs_dat_commit_end(), a new pointer b_private was added to struct > nilfs_bmap. > > Signed-off-by: Andreas Rohner <andreas.rohner@xxxxxxx> > --- > fs/nilfs2/bmap.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/nilfs2/bmap.h | 11 +++++++- > fs/nilfs2/btree.c | 2 +- > fs/nilfs2/direct.c | 2 +- > fs/nilfs2/inode.c | 22 +++++++++++++--- > fs/nilfs2/segment.c | 26 +++++++++++++++--- > fs/nilfs2/segment.h | 3 +++ > 7 files changed, 132 insertions(+), 10 deletions(-) > > diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c > index ecd62ba..927acb7 100644 > --- a/fs/nilfs2/bmap.c > +++ b/fs/nilfs2/bmap.c > @@ -288,6 +288,43 @@ int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key) > } > > /** > + * nilfs_bmap_truncate_with_mc - truncate a bmap to a specified key > + * @bmap: bmap > + * @mc: modification cache > + * @key: key > + * > + * Description: nilfs_bmap_truncate_with_mc() removes key-record pairs whose > + * keys are greater than or equal to @key from @bmap. 
It has the same > + * functionality as nilfs_bmap_truncate(), but allows the passing > + * of a modification cache to update segment usage information. > + * > + * Return Value: On success, 0 is returned. On error, one of the following > + * negative error codes is returned. > + * > + * %-EIO - I/O error. > + * > + * %-ENOMEM - Insufficient amount of memory available. > + */ > +int nilfs_bmap_truncate_with_mc(struct nilfs_bmap *bmap, > + struct nilfs_sufile_mod_cache *mc, > + unsigned long key) > +{ > + int ret; > + > + down_write(&bmap->b_sem); > + > + bmap->b_private = mc; > + > + ret = nilfs_bmap_do_truncate(bmap, key); > + > + bmap->b_private = NULL; > + > + up_write(&bmap->b_sem); > + > + return nilfs_bmap_convert_error(bmap, __func__, ret); > +} > + > +/** > * nilfs_bmap_clear - free resources a bmap holds > * @bmap: bmap > * > @@ -328,6 +365,43 @@ int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) > } > > /** > + * nilfs_bmap_propagate_with_mc - propagate dirty state > + * @bmap: bmap > + * @mc: modification cache > + * @bh: buffer head > + * > + * Description: nilfs_bmap_propagate_with_mc() marks the buffers that directly > + * or indirectly refer to the block specified by @bh dirty. It has > + * the same functionality as nilfs_bmap_propagate(), but allows the passing > + * of a modification cache to update segment usage information. > + * > + * Return Value: On success, 0 is returned. On error, one of the following > + * negative error codes is returned. > + * > + * %-EIO - I/O error. > + * > + * %-ENOMEM - Insufficient amount of memory available. 
> + */ > +int nilfs_bmap_propagate_with_mc(struct nilfs_bmap *bmap, > + struct nilfs_sufile_mod_cache *mc, > + struct buffer_head *bh) > +{ > + int ret; > + > + down_write(&bmap->b_sem); > + > + bmap->b_private = mc; > + > + ret = bmap->b_ops->bop_propagate(bmap, bh); > + > + bmap->b_private = NULL; > + > + up_write(&bmap->b_sem); > + > + return nilfs_bmap_convert_error(bmap, __func__, ret); > +} These bmap functions are really bad. The mod cache argument has no meaning with regard to block mapping operations. I really hope we can avoid adding these variants by hiding the cache in sufile. > + > +/** > * nilfs_bmap_lookup_dirty_buffers - > * @bmap: bmap > * @listp: pointer to buffer head list > @@ -490,6 +564,7 @@ int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode) > > init_rwsem(&bmap->b_sem); > bmap->b_state = 0; > + bmap->b_private = NULL; > bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; > switch (bmap->b_inode->i_ino) { > case NILFS_DAT_INO: > @@ -551,6 +626,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) > bmap->b_last_allocated_key = 0; > bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR; > bmap->b_state = 0; > + bmap->b_private = NULL; > nilfs_btree_init_gc(bmap); > } > > diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h > index 718c814..a8b935a 100644 > --- a/fs/nilfs2/bmap.h > +++ b/fs/nilfs2/bmap.h > @@ -36,6 +36,7 @@ > > > struct nilfs_bmap; > +struct nilfs_sufile_mod_cache; > > /** > * union nilfs_bmap_ptr_req - request for bmap ptr > @@ -106,6 +107,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) > * @b_ptr_type: pointer type > * @b_state: state > * @b_nchildren_per_block: maximum number of child nodes for non-root nodes > + * @b_private: pointer for extra data > */ > struct nilfs_bmap { > union { > @@ -120,6 +122,7 @@ struct nilfs_bmap { > int b_ptr_type; > int b_state; > __u16 b_nchildren_per_block; > + void *b_private; > }; > > /* pointer type */ > @@ -157,8 +160,14 @@ int nilfs_bmap_insert(struct 
nilfs_bmap *, unsigned long, unsigned long); > int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); > int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *); > int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long); > +int nilfs_bmap_truncate_with_mc(struct nilfs_bmap *, > + struct nilfs_sufile_mod_cache *, > + unsigned long); > void nilfs_bmap_clear(struct nilfs_bmap *); > int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *); > +int nilfs_bmap_propagate_with_mc(struct nilfs_bmap *, > + struct nilfs_sufile_mod_cache *, > + struct buffer_head *); > void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *); > int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **, > unsigned long, union nilfs_binfo *); > @@ -222,7 +231,7 @@ static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, > struct inode *dat) > { > if (dat) > - nilfs_dat_commit_end(dat, &req->bpr_req, NULL, > + nilfs_dat_commit_end(dat, &req->bpr_req, bmap->b_private, > bmap->b_ptr_type == NILFS_BMAP_PTR_VS, > bmap->b_inode->i_ino != NILFS_SUFILE_INO); > } > diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c > index 2af0519..c3c883e 100644 > --- a/fs/nilfs2/btree.c > +++ b/fs/nilfs2/btree.c > @@ -1851,7 +1851,7 @@ static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, > > nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, > &path[level].bp_newreq.bpr_req, > - NULL, > + btree->b_private, > btree->b_ptr_type == NILFS_BMAP_PTR_VS, > btree->b_inode->i_ino != NILFS_SUFILE_INO); > > diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c > index e022cfb..a716bba 100644 > --- a/fs/nilfs2/direct.c > +++ b/fs/nilfs2/direct.c > @@ -272,7 +272,7 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap, > if (ret < 0) > return ret; > nilfs_dat_commit_update(dat, &oldreq, &newreq, > - NULL, > + bmap->b_private, > bmap->b_ptr_type == NILFS_BMAP_PTR_VS, > bmap->b_inode->i_ino != NILFS_SUFILE_INO); > 
set_buffer_nilfs_volatile(bh); > diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c > index 8b59695..7f6d056 100644 > --- a/fs/nilfs2/inode.c > +++ b/fs/nilfs2/inode.c > @@ -34,6 +34,7 @@ > #include "mdt.h" > #include "cpfile.h" > #include "ifile.h" > +#include "sufile.h" > > /** > * struct nilfs_iget_args - arguments used during comparison between inodes > @@ -714,29 +715,42 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) > static void nilfs_truncate_bmap(struct nilfs_inode_info *ii, > unsigned long from) > { > + struct the_nilfs *nilfs = ii->vfs_inode.i_sb->s_fs_info; > + struct nilfs_sufile_mod_cache mc, *mcp = NULL; > unsigned long b; > int ret; > > if (!test_bit(NILFS_I_BMAP, &ii->i_state)) > return; > + > + if (nilfs_feature_track_live_blks(nilfs) && > + !nilfs_sufile_mc_init(&mc, NILFS_SUFILE_MC_SIZE_DEFAULT)) > + mcp = &mc; > + > repeat: > ret = nilfs_bmap_last_key(ii->i_bmap, &b); > if (ret == -ENOENT) > - return; > + goto out_free; > else if (ret < 0) > goto failed; > > if (b < from) > - return; > + goto out_free; > > b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from); > - ret = nilfs_bmap_truncate(ii->i_bmap, b); > + ret = nilfs_bmap_truncate_with_mc(ii->i_bmap, mcp, b); > nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb); > if (!ret || (ret == -ENOMEM && > - nilfs_bmap_truncate(ii->i_bmap, b) == 0)) > + nilfs_bmap_truncate_with_mc(ii->i_bmap, mcp, b) == 0)) > goto repeat; > > +out_free: > + nilfs_sufile_flush_nlive_blks(nilfs->ns_sufile, mcp); > + nilfs_sufile_mc_destroy(mcp); > + return; > failed: > + nilfs_sufile_flush_nlive_blks(nilfs->ns_sufile, mcp); > + nilfs_sufile_mc_destroy(mcp); > nilfs_warning(ii->vfs_inode.i_sb, __func__, > "failed to truncate bmap (ino=%lu, err=%d)", > ii->vfs_inode.i_ino, ret); > diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c > index 6059f53..dc0070c 100644 > --- a/fs/nilfs2/segment.c > +++ b/fs/nilfs2/segment.c > @@ -511,7 +511,8 @@ static int 
nilfs_collect_file_data(struct nilfs_sc_info *sci, > { > int err; > > - err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); > + err = nilfs_bmap_propagate_with_mc(NILFS_I(inode)->i_bmap, > + sci->sc_mc, bh); > if (err < 0) > return err; > > @@ -526,7 +527,8 @@ static int nilfs_collect_file_node(struct nilfs_sc_info *sci, > struct buffer_head *bh, > struct inode *inode) > { > - return nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh); > + return nilfs_bmap_propagate_with_mc(NILFS_I(inode)->i_bmap, > + sci->sc_mc, bh); > } > > static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci, > @@ -1386,7 +1388,7 @@ static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, > segbuf->sb_nlive_blks_added = segbuf->sb_sum.nfileblk; > > if (nilfs_feature_track_live_blks(nilfs)) > - nilfs_sufile_mod_nlive_blks(sufile, NULL, > + nilfs_sufile_mod_nlive_blks(sufile, sci->sc_mc, > segbuf->sb_segnum, > segbuf->sb_nlive_blks_added); > } > @@ -2014,6 +2016,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) > } > nilfs_segctor_update_segusage(sci, nilfs); > > + nilfs_sufile_flush_nlive_blks(nilfs->ns_sufile, > + sci->sc_mc); > + > /* Write partial segments */ > nilfs_segctor_prepare_write(sci); > > @@ -2603,6 +2608,7 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, > { > struct the_nilfs *nilfs = sb->s_fs_info; > struct nilfs_sc_info *sci; > + int ret; > > sci = kzalloc(sizeof(*sci), GFP_KERNEL); > if (!sci) > @@ -2633,6 +2639,18 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb, > sci->sc_interval = HZ * nilfs->ns_interval; > if (nilfs->ns_watermark) > sci->sc_watermark = nilfs->ns_watermark; > + > + if (nilfs_feature_track_live_blks(nilfs)) { > + sci->sc_mc = kmalloc(sizeof(*(sci->sc_mc)), GFP_KERNEL); > + if (sci->sc_mc) { > + ret = nilfs_sufile_mc_init(sci->sc_mc, > + NILFS_SUFILE_MC_SIZE_EXT); > + if (ret) { > + kfree(sci->sc_mc); > + sci->sc_mc = NULL; > + } > + } > + } > return sci; > } > 
> @@ -2701,6 +2719,8 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) > down_write(&nilfs->ns_segctor_sem); > > del_timer_sync(&sci->sc_timer); > + nilfs_sufile_mc_destroy(sci->sc_mc); > + kfree(sci->sc_mc); > kfree(sci); > } > > diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h > index a48d6de..a857527 100644 > --- a/fs/nilfs2/segment.h > +++ b/fs/nilfs2/segment.h > @@ -80,6 +80,7 @@ struct nilfs_cstage { > }; > > struct nilfs_segment_buffer; > +struct nilfs_sufile_mod_cache; > > struct nilfs_segsum_pointer { > struct buffer_head *bh; > @@ -129,6 +130,7 @@ struct nilfs_segsum_pointer { > * @sc_watermark: Watermark for the number of dirty buffers > * @sc_timer: Timer for segctord > * @sc_task: current thread of segctord > + * @sc_mc: mod cache to add up updates for SUFILE during seg construction > */ > struct nilfs_sc_info { > struct super_block *sc_super; > @@ -185,6 +187,7 @@ struct nilfs_sc_info { > > struct timer_list sc_timer; > struct task_struct *sc_task; > + struct nilfs_sufile_mod_cache *sc_mc; > }; > > /* sc_flags */ Again, I really hope you eliminate these changes by hiding the cache in sufile. Regards, Ryusuke Konishi > -- > 2.3.0 > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html