On 2015-03-14 04:51, Ryusuke Konishi wrote: > On Tue, 24 Feb 2015 20:01:44 +0100, Andreas Rohner wrote: >> It doesn't really matter if the number of reclaimable blocks for a >> segment is inaccurate, as long as the overall performance is better than >> the simple timestamp algorithm and starvation is prevented. >> >> The following steps will lead to starvation of a segment: >> >> 1. The segment is written >> 2. A snapshot is created >> 3. The files in the segment are deleted and the number of live >> blocks for the segment is decremented to a very low value >> 4. The GC tries to free the segment, but there are no reclaimable >> blocks, because they are all protected by the snapshot. To prevent an >> infinite loop the GC has to adjust the number of live blocks to the >> correct value. >> 5. The snapshot is converted to a checkpoint and the blocks in the >> segment are now reclaimable. >> 6. The GC will never attemt to clean the segment again, because of it >> incorrectly shows up as having a high number of live blocks. >> >> To prevent this, the already existing padding field of the SUFILE entry >> is used to track the number of snapshot blocks in the segment. This >> number is only set by the GC, since it collects the necessary >> information anyway. So there is no need, to track which block belongs to >> which segment. In step 4 of the list above the GC will set the new field >> su_nsnapshot_blks. In step 5 all entries in the SUFILE are checked and >> entries with a big su_nsnapshot_blks field get their su_nlive_blks field >> reduced. >> >> Signed-off-by: Andreas Rohner <andreas.rohner@xxxxxxx> >> --- >> fs/nilfs2/cpfile.c | 5 ++ >> fs/nilfs2/segbuf.c | 1 + >> fs/nilfs2/segbuf.h | 1 + >> fs/nilfs2/segment.c | 7 ++- >> fs/nilfs2/sufile.c | 114 ++++++++++++++++++++++++++++++++++++++++++---- >> fs/nilfs2/sufile.h | 4 +- >> fs/nilfs2/the_nilfs.h | 7 +++ >> include/linux/nilfs2_fs.h | 12 +++-- >> 8 files changed, 136 insertions(+), 15 deletions(-) >> >> diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c >> index 0d58075..6b61fd7 100644 >> --- a/fs/nilfs2/cpfile.c >> +++ b/fs/nilfs2/cpfile.c >> @@ -28,6 +28,7 @@ >> #include <linux/nilfs2_fs.h> >> #include "mdt.h" >> #include "cpfile.h" >> +#include "sufile.h" >> >> >> static inline unsigned long >> @@ -703,6 +704,7 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) >> struct nilfs_cpfile_header *header; >> struct nilfs_checkpoint *cp; >> struct nilfs_snapshot_list *list; >> + struct the_nilfs *nilfs = cpfile->i_sb->s_fs_info; >> __u64 next, prev; >> void *kaddr; >> int ret; >> @@ -784,6 +786,9 @@ static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno) >> mark_buffer_dirty(header_bh); >> nilfs_mdt_mark_dirty(cpfile); >> >> + if (nilfs_feature_track_snapshots(nilfs)) >> + nilfs_sufile_fix_starving_segs(nilfs->ns_sufile); >> + >> brelse(prev_bh); >> >> out_next: >> diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c >> index bbd807b..a98c576 100644 >> --- a/fs/nilfs2/segbuf.c >> +++ b/fs/nilfs2/segbuf.c >> @@ -59,6 +59,7 @@ struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb) >> segbuf->sb_super_root = NULL; >> segbuf->sb_nlive_blks_added = 0; >> segbuf->sb_nlive_blks_diff = 0; >> + segbuf->sb_nsnapshot_blks = 0; >> >> init_completion(&segbuf->sb_bio_event); >> atomic_set(&segbuf->sb_err, 0); >> diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h >> index 4e994f7..7a462c4 100644 >> --- a/fs/nilfs2/segbuf.h >> +++ b/fs/nilfs2/segbuf.h >> @@ -85,6 +85,7 @@ struct nilfs_segment_buffer { >> unsigned sb_rest_blocks; >> __u32 sb_nlive_blks_added; >> __s64 sb_nlive_blks_diff; >> + __u32 sb_nsnapshot_blks; >> >> /* Buffers */ >> struct list_head sb_segsum_buffers; >> diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c >> index 16c7c36..b976198 100644 >> --- a/fs/nilfs2/segment.c >> +++ b/fs/nilfs2/segment.c >> @@ -1381,6 +1381,7 @@ static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci, >> (segbuf->sb_pseg_start - segbuf->sb_fseg_start); >> ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, >> live_blocks, >> + segbuf->sb_nsnapshot_blks, >> sci->sc_seg_ctime); >> WARN_ON(ret); /* always succeed because the segusage is dirty */ >> >> @@ -1405,7 +1406,7 @@ static void nilfs_cancel_segusage(struct list_head *logs, >> segbuf = NILFS_FIRST_SEGBUF(logs); >> ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, >> segbuf->sb_pseg_start - >> - segbuf->sb_fseg_start, 0); >> + segbuf->sb_fseg_start, 0, 0); >> WARN_ON(ret); /* always succeed because the segusage is dirty */ >> >> if (nilfs_feature_track_live_blks(nilfs)) >> @@ -1416,7 +1417,7 @@ static void nilfs_cancel_segusage(struct list_head *logs, >> >> list_for_each_entry_continue(segbuf, logs, sb_list) { >> ret = nilfs_sufile_set_segment_usage(sufile, segbuf->sb_segnum, >> - 0, 0); >> + 0, 0, 0); >> WARN_ON(ret); /* always succeed */ >> } >> } >> @@ -1521,6 +1522,8 @@ static void nilfs_segctor_dec_nlive_blks_gc(struct inode *dat, >> >> if (!buffer_nilfs_snapshot(bh) && isreclaimable) >> segbuf->sb_nlive_blks_diff--; >> + if (buffer_nilfs_snapshot(bh)) >> + segbuf->sb_nsnapshot_blks++; >> } >> >> /** >> diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c >> index 574a77e..a6dc7bf 100644 >> --- a/fs/nilfs2/sufile.c >> +++ b/fs/nilfs2/sufile.c >> @@ -468,7 +468,7 @@ void nilfs_sufile_do_scrap(struct inode *sufile, __u64 *data, >> su->su_flags = cpu_to_le32(1UL << NILFS_SEGMENT_USAGE_DIRTY); >> if (nilfs_sufile_ext_supported(sufile)) { >> su->su_nlive_blks = cpu_to_le32(0); >> - su->su_pad = cpu_to_le32(0); >> + su->su_nsnapshot_blks = cpu_to_le32(0); >> su->su_nlive_lastmod = cpu_to_le64(0); >> } >> kunmap_atomic(kaddr); >> @@ -538,7 +538,8 @@ int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum) >> * @modtime: modification time (option) >> */ >> int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, >> - unsigned long nblocks, time_t modtime) >> + unsigned long nblocks, __u32 nsnapshot_blks, >> + time_t modtime) >> { >> struct buffer_head *bh; >> struct nilfs_segment_usage *su; >> @@ -556,9 +557,18 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, >> if (modtime) >> su->su_lastmod = cpu_to_le64(modtime); >> su->su_nblocks = cpu_to_le32(nblocks); >> - if (nilfs_sufile_ext_supported(sufile) && >> - nblocks < le32_to_cpu(su->su_nlive_blks)) >> - su->su_nlive_blks = su->su_nblocks; >> + if (nilfs_sufile_ext_supported(sufile)) { >> + if (nblocks < le32_to_cpu(su->su_nlive_blks)) >> + su->su_nlive_blks = su->su_nblocks; >> + >> + nsnapshot_blks += le32_to_cpu(su->su_nsnapshot_blks); >> + >> + if (nblocks < nsnapshot_blks) >> + nsnapshot_blks = nblocks; >> + >> + su->su_nsnapshot_blks = cpu_to_le32(nsnapshot_blks); >> + } >> + >> kunmap_atomic(kaddr); >> >> mark_buffer_dirty(bh); >> @@ -891,7 +901,7 @@ ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum, void *buf, >> >> if (sisz >= NILFS_EXT_SUINFO_SIZE) { >> si->sui_nlive_blks = nlb; >> - si->sui_pad = 0; >> + si->sui_nsnapshot_blks = 0; >> si->sui_nlive_lastmod = lm; >> } >> } >> @@ -939,6 +949,7 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, >> int ret = 0; >> bool sup_ext = (supsz >= NILFS_EXT_SUINFO_UPDATE_SIZE); >> bool su_ext = nilfs_sufile_ext_supported(sufile); >> + bool supsu_ext = sup_ext && su_ext; >> >> if (unlikely(nsup == 0)) >> return ret; >> @@ -952,6 +963,10 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, >> nilfs->ns_blocks_per_segment) >> || (nilfs_suinfo_update_nlive_blks(sup) && sup_ext && >> sup->sup_sui.sui_nlive_blks > >> + nilfs->ns_blocks_per_segment) >> + || (nilfs_suinfo_update_nsnapshot_blks(sup) && >> + sup_ext && >> + sup->sup_sui.sui_nsnapshot_blks > >> nilfs->ns_blocks_per_segment)) >> return -EINVAL; >> } >> @@ -979,11 +994,15 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, >> if (nilfs_suinfo_update_nblocks(sup)) >> su->su_nblocks = cpu_to_le32(sup->sup_sui.sui_nblocks); >> >> - if (nilfs_suinfo_update_nlive_blks(sup) && sup_ext && su_ext) >> + if (nilfs_suinfo_update_nlive_blks(sup) && supsu_ext) >> su->su_nlive_blks = >> cpu_to_le32(sup->sup_sui.sui_nlive_blks); >> >> - if (nilfs_suinfo_update_nlive_lastmod(sup) && sup_ext && su_ext) >> + if (nilfs_suinfo_update_nsnapshot_blks(sup) && supsu_ext) >> + su->su_nsnapshot_blks = >> + cpu_to_le32(sup->sup_sui.sui_nsnapshot_blks); >> + >> + if (nilfs_suinfo_update_nlive_lastmod(sup) && supsu_ext) >> su->su_nlive_lastmod = >> cpu_to_le64(sup->sup_sui.sui_nlive_lastmod); >> >> @@ -1050,6 +1069,85 @@ ssize_t nilfs_sufile_set_suinfo(struct inode *sufile, void *buf, >> } >> >> /** >> + * nilfs_sufile_fix_starving_segs - fix potentially starving segments >> + * @sufile: inode of segment usage file >> + * >> + * Description: Scans for segments, which are potentially starving and >> + * reduces the number of live blocks to less than half of the maximum >> + * number of blocks in a segment. This way the segment is more likely to be >> + * chosen by the GC. A segment is marked as potentially starving, if more >> + * than half of the blocks it contains are protected by snapshots. >> + * >> + * Return Value: On success, 0 is returned and on error, one of the >> + * following negative error codes is returned. >> + * >> + * %-EIO - I/O error. >> + * >> + * %-ENOMEM - Insufficient amount of memory available. >> + */ >> +int nilfs_sufile_fix_starving_segs(struct inode *sufile) >> +{ >> + struct buffer_head *su_bh; >> + struct nilfs_segment_usage *su; >> + size_t n, i, susz = NILFS_MDT(sufile)->mi_entry_size; >> + struct the_nilfs *nilfs = sufile->i_sb->s_fs_info; >> + void *kaddr; >> + unsigned long nsegs, segusages_per_block; >> + __u32 max_segblks = nilfs->ns_blocks_per_segment / 2; >> + __u64 segnum = 0; >> + int ret = 0, blkdirty, dirty = 0; >> + >> + down_write(&NILFS_MDT(sufile)->mi_sem); >> + >> + segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile); >> + nsegs = nilfs_sufile_get_nsegments(sufile); >> + >> + while (segnum < nsegs) { >> + n = nilfs_sufile_segment_usages_in_block(sufile, segnum, >> + nsegs - 1); >> + >> + ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, >> + 0, &su_bh); >> + if (ret < 0) { >> + if (ret != -ENOENT) >> + goto out; >> + /* hole */ >> + segnum += n; >> + continue; >> + } >> + >> + kaddr = kmap_atomic(su_bh->b_page); >> + su = nilfs_sufile_block_get_segment_usage(sufile, segnum, >> + su_bh, kaddr); >> + blkdirty = 0; >> + for (i = 0; i < n; ++i, ++segnum, su = (void *)su + susz) { >> + if (le32_to_cpu(su->su_nsnapshot_blks) <= max_segblks) >> + continue; >> + >> + if (su->su_nlive_blks <= max_segblks) >> + continue; >> + >> + su->su_nlive_blks = max_segblks; >> + blkdirty = 1; >> + } >> + >> + kunmap_atomic(kaddr); >> + if (blkdirty) { >> + mark_buffer_dirty(su_bh); >> + dirty = 1; >> + } >> + put_bh(su_bh); >> + } >> + >> +out: >> + if (dirty) >> + nilfs_mdt_mark_dirty(sufile); >> + >> + up_write(&NILFS_MDT(sufile)->mi_sem); >> + return ret; >> +} >> + >> +/** >> * nilfs_sufile_trim_fs() - trim ioctl handle function >> * @sufile: inode of segment usage file >> * @range: fstrim_range structure >> diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h >> index ae3c52a..e831622 100644 >> --- a/fs/nilfs2/sufile.h >> +++ b/fs/nilfs2/sufile.h >> @@ -45,7 +45,8 @@ int nilfs_sufile_set_alloc_range(struct inode *sufile, __u64 start, __u64 end); >> int nilfs_sufile_alloc(struct inode *, __u64 *); >> int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum); >> int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum, >> - unsigned long nblocks, time_t modtime); >> + unsigned long nblocks, __u32 nsnapshot_blks, >> + time_t modtime); >> int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *); >> ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, void *, unsigned, >> size_t); >> @@ -72,6 +73,7 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs); >> int nilfs_sufile_read(struct super_block *sb, size_t susize, >> struct nilfs_inode *raw_inode, struct inode **inodep); >> int nilfs_sufile_trim_fs(struct inode *sufile, struct fstrim_range *range); >> +int nilfs_sufile_fix_starving_segs(struct inode *); >> >> /** >> * nilfs_sufile_scrap - make a segment garbage >> diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h >> index 87cab10..3d495f1 100644 >> --- a/fs/nilfs2/the_nilfs.h >> +++ b/fs/nilfs2/the_nilfs.h >> @@ -409,4 +409,11 @@ static inline int nilfs_feature_track_live_blks(struct the_nilfs *nilfs) >> NILFS_FEATURE_COMPAT_SUFILE_EXTENSION); >> } >> >> +static inline int nilfs_feature_track_snapshots(struct the_nilfs *nilfs) >> +{ >> + return (nilfs->ns_feature_compat & >> + NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS) && >> + nilfs_feature_track_live_blks(nilfs); >> +} >> + >> #endif /* _THE_NILFS_H */ >> diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h >> index 6ffdc09..a3c7593 100644 >> --- a/include/linux/nilfs2_fs.h >> +++ b/include/linux/nilfs2_fs.h >> @@ -222,11 +222,13 @@ struct nilfs_super_block { >> */ >> #define NILFS_FEATURE_COMPAT_SUFILE_EXTENSION (1ULL << 0) >> #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL << 1) >> +#define NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS (1ULL << 2) >> >> #define NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT (1ULL << 0) >> >> #define NILFS_FEATURE_COMPAT_SUPP (NILFS_FEATURE_COMPAT_SUFILE_EXTENSION \ >> - | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS) >> + | NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS \ >> + | NILFS_FEATURE_COMPAT_TRACK_SNAPSHOTS) >> #define NILFS_FEATURE_COMPAT_RO_SUPP NILFS_FEATURE_COMPAT_RO_BLOCK_COUNT >> #define NILFS_FEATURE_INCOMPAT_SUPP 0ULL >> > > You don't have to add three compat flags just for this one patchset. > Please unify it. > > #define NILFS_FEATURE_COMPAT_TRACK_LIVE_BLKS (1ULL << 0) > > looks to be enough. I could merge the TRACK_LIVE_BLKS and TRACK_SNAPSHOTS flag, but I would suggest to at least leave the SUFILE_EXTENSION flag (maybe with a different name). The SUFILE_EXTENSION flag has to be set at mkfs time and it cannot be set or removed later, because you cannot change the on disk format later. I actually set SUFILE_EXTENSION by default in mkfs, because it is not harmful and it gives the user the option to switch the other flags on later. Regards, Andreas Rohner > Regards, > Ryusuke Konishi > > >> @@ -630,7 +632,7 @@ struct nilfs_segment_usage { >> __le32 su_nblocks; >> __le32 su_flags; >> __le32 su_nlive_blks; >> - __le32 su_pad; >> + __le32 su_nsnapshot_blks; >> __le64 su_nlive_lastmod; >> }; >> >> @@ -682,7 +684,7 @@ nilfs_segment_usage_set_clean(struct nilfs_segment_usage *su, size_t susz) >> su->su_flags = cpu_to_le32(0); >> if (susz >= NILFS_EXT_SEGMENT_USAGE_SIZE) { >> su->su_nlive_blks = cpu_to_le32(0); >> - su->su_pad = cpu_to_le32(0); >> + su->su_nsnapshot_blks = cpu_to_le32(0); >> su->su_nlive_lastmod = cpu_to_le64(0); >> } >> } >> @@ -723,7 +725,7 @@ struct nilfs_suinfo { >> __u32 sui_nblocks; >> __u32 sui_flags; >> __u32 sui_nlive_blks; >> - __u32 sui_pad; >> + __u32 sui_nsnapshot_blks; >> __u64 sui_nlive_lastmod; >> }; >> >> @@ -770,6 +772,7 @@ enum { >> NILFS_SUINFO_UPDATE_FLAGS, >> NILFS_SUINFO_UPDATE_NLIVE_BLKS, >> NILFS_SUINFO_UPDATE_NLIVE_LASTMOD, >> + NILFS_SUINFO_UPDATE_NSNAPSHOT_BLKS, >> __NR_NILFS_SUINFO_UPDATE_FIELDS, >> }; >> >> @@ -794,6 +797,7 @@ NILFS_SUINFO_UPDATE_FNS(LASTMOD, lastmod) >> NILFS_SUINFO_UPDATE_FNS(NBLOCKS, nblocks) >> NILFS_SUINFO_UPDATE_FNS(FLAGS, flags) >> NILFS_SUINFO_UPDATE_FNS(NLIVE_BLKS, nlive_blks) >> +NILFS_SUINFO_UPDATE_FNS(NSNAPSHOT_BLKS, nsnapshot_blks) >> NILFS_SUINFO_UPDATE_FNS(NLIVE_LASTMOD, nlive_lastmod) >> >> #define NILFS_MIN_SUINFO_UPDATE_SIZE \ >> -- >> 2.3.0 >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in >> the body of a message to majordomo@xxxxxxxxxxxxxxx >> More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html