On Sun, 2014-03-16 at 11:47 +0100, Andreas Rohner wrote: > This patch introduces new flags for nilfs_vdesc to indicate the reason a > block is alive. So if the block would be reclaimable, but must be > treated as if it were alive, because it is part of a snapshot, then the > snapshot flag is set. > I suppose that I don't quite follow your idea. As far as I can judge, every block in DAT file has: (1) de_start: start checkpoint number; (2) de_end: end checkpoint number. So, while one of checkpoint number is snapshot number then we know that this block lives in snapshot. Am I correct? Why do we need in special flags? > Additionally a new ioctl() is added, which enables the userspace GC to > perform a cleanup operation after setting the number of blocks with > NILFS_IOCTL_SET_SUINFO. It sets DAT entries with de_ss values of > NILFS_CNO_MAX to 0. NILFS_CNO_MAX indicates, that the corresponding > block belongs to some snapshot, but was already decremented by a > previous deletion operation. If the segment usage info is changed with > NILFS_IOCTL_SET_SUINFO and the number of blocks is updated, then these > blocks would never be decremented and there are scenarios where the > corresponding segments would starve (never be cleaned). To prevent that > they must be reset to 0. > > Signed-off-by: Andreas Rohner <andreas.rohner@xxxxxxx> > --- > fs/nilfs2/dat.c | 63 ++++++++++++++++++++++++++++ > fs/nilfs2/dat.h | 1 + > fs/nilfs2/ioctl.c | 103 +++++++++++++++++++++++++++++++++++++++++++++- > include/linux/nilfs2_fs.h | 52 ++++++++++++++++++++++- > 4 files changed, 216 insertions(+), 3 deletions(-) > > diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c > index 89a4a5f..7adb15d 100644 > --- a/fs/nilfs2/dat.c > +++ b/fs/nilfs2/dat.c > @@ -382,6 +382,69 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr) > } > > /** > + * nilfs_dat_clean_snapshot_flag - check flags used by snapshots > + * @dat: DAT file inode > + * @vblocknr: virtual block number > + * > + * Description: nilfs_dat_clean_snapshot_flag() changes the flags from > + * NILFS_CNO_MAX to 0 if necessary, so that segment usage is accurately > + * counted. NILFS_CNO_MAX indicates, that the corresponding block belongs > + * to some snapshot, but was already decremented. If the segment usage info > + * is changed with NILFS_IOCTL_SET_SUINFO and the number of blocks is updated, > + * then these blocks would never be decremented and there are scenarios where > + * the corresponding segments would starve (never be cleaned). > + * > + * Return Value: On success, 0 is returned. On error, one of the following > + * negative error codes is returned. > + * > + * %-EIO - I/O error. > + * > + * %-ENOMEM - Insufficient amount of memory available. > + */ > +int nilfs_dat_clean_snapshot_flag(struct inode *dat, __u64 vblocknr) Sounds likewise we clear flag. It can be confusing name. > +{ > + struct buffer_head *entry_bh; > + struct nilfs_dat_entry *entry; > + void *kaddr; > + int ret; > + > + ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh); > + if (ret < 0) > + return ret; > + > + /* > + * The given disk block number (blocknr) is not yet written to > + * the device at this point. > + * > + * To prevent nilfs_dat_translate() from returning the > + * uncommitted block number, this makes a copy of the entry > + * buffer and redirects nilfs_dat_translate() to the copy. > + */ > + if (!buffer_nilfs_redirected(entry_bh)) { > + ret = nilfs_mdt_freeze_buffer(dat, entry_bh); > + if (ret) { > + brelse(entry_bh); > + return ret; > + } > + } > + > + kaddr = kmap_atomic(entry_bh->b_page); > + entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr); > + if (entry->de_ss == cpu_to_le64(NILFS_CNO_MAX)) { > + entry->de_ss = cpu_to_le64(0); > + kunmap_atomic(kaddr); > + mark_buffer_dirty(entry_bh); > + nilfs_mdt_mark_dirty(dat); > + } else { > + kunmap_atomic(kaddr); > + } Brackets are unnecessary here. > + > + brelse(entry_bh); > + > + return 0; > +} > + > +/** > * nilfs_dat_translate - translate a virtual block number to a block number > * @dat: DAT file inode > * @vblocknr: virtual block number > diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h > index 92a187e..a528024 100644 > --- a/fs/nilfs2/dat.h > +++ b/fs/nilfs2/dat.h > @@ -51,6 +51,7 @@ void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, > int nilfs_dat_mark_dirty(struct inode *, __u64); > int nilfs_dat_freev(struct inode *, __u64 *, size_t); > int nilfs_dat_move(struct inode *, __u64, sector_t); > +int nilfs_dat_clean_snapshot_flag(struct inode *, __u64); > ssize_t nilfs_dat_get_vinfo(struct inode *, void *, unsigned, size_t); > > int nilfs_dat_read(struct super_block *sb, size_t entry_size, > diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c > index 422fb54..0b62bf4 100644 > --- a/fs/nilfs2/ioctl.c > +++ b/fs/nilfs2/ioctl.c > @@ -578,7 +578,7 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode, > struct buffer_head *bh; > int ret; > > - if (vdesc->vd_flags == 0) > + if (nilfs_vdesc_data(vdesc)) > ret = nilfs_gccache_submit_read_data( > inode, vdesc->vd_offset, vdesc->vd_blocknr, > vdesc->vd_vblocknr, &bh); > @@ -662,6 +662,14 @@ static int nilfs_ioctl_move_blocks(struct super_block *sb, > } > > do { > + /* > + * old user space tools to not initialize vd_flags2 > + * check if it contains invalid flags > + */ > + if (vdesc->vd_flags2 & "vd_flags2" is really bad naming. Completely obscure. > + (~0UL << __NR_NILFS_VDESC_FIELDS)) Looks weird. > + vdesc->vd_flags2 = 0; > + > ret = nilfs_ioctl_move_inode_block(inode, vdesc, > &buffers); > if (unlikely(ret < 0)) { > @@ -984,6 +992,96 @@ out: > } > > /** > + * nilfs_ioctl_clean_snapshot_flags - clean dat entries with invalid de_ss Ditto. Sounds likewise clearing of flag. > + * @inode: inode object > + * @filp: file object > + * @cmd: ioctl's request code > + * @argp: pointer on argument from userspace > + * > + * Description: nilfs_ioctl_clean_snapshot_flags() sets DAT entries with de_ss > + * values of NILFS_CNO_MAX to 0. NILFS_CNO_MAX indicates, that the > + * corresponding block belongs to some snapshot, but was already decremented. > + * If the segment usage info is changed with NILFS_IOCTL_SET_SUINFO and the > + * number of blocks is updated, then these blocks would never be decremented > + * and there are scenarios where the corresponding segments would starve (never > + * be cleaned). > + * > + * Return Value: On success, 0 is returned or error code, otherwise. > + */ > +static int nilfs_ioctl_clean_snapshot_flags(struct inode *inode, > + struct file *filp, > + unsigned int cmd, > + void __user *argp) > +{ > + struct the_nilfs *nilfs = inode->i_sb->s_fs_info; > + struct nilfs_transaction_info ti; > + struct nilfs_argv argv; > + struct nilfs_vdesc *vdesc; > + size_t len, i; > + void __user *base; > + void *kbuf; > + int ret; > + > + if (!capable(CAP_SYS_ADMIN)) > + return -EPERM; > + > + ret = mnt_want_write_file(filp); > + if (ret) > + return ret; > + > + ret = -EFAULT; > + if (copy_from_user(&argv, argp, sizeof(struct nilfs_argv))) > + goto out; > + > + ret = -EINVAL; > + if (argv.v_size != sizeof(struct nilfs_vdesc)) > + goto out; > + if (argv.v_nmembs > UINT_MAX / sizeof(struct nilfs_vdesc)) > + goto out; > + > + len = argv.v_size * argv.v_nmembs; > + if (!len) { > + ret = 0; > + goto out; > + } > + > + base = (void __user *)(unsigned long)argv.v_base; > + kbuf = vmalloc(len); > + if (!kbuf) { > + ret = -ENOMEM; > + goto out; > + } > + > + if (copy_from_user(kbuf, base, len)) { > + ret = -EFAULT; > + goto out_free; > + } > + > + ret = nilfs_transaction_begin(inode->i_sb, &ti, 0); > + if (unlikely(ret)) > + goto out_free; > + > + for (i = 0, vdesc = kbuf; i < argv.v_nmembs; ++i, ++vdesc) { > + if (nilfs_vdesc_snapshot(vdesc)) { > + ret = nilfs_dat_clean_snapshot_flag(nilfs->ns_dat, > + vdesc->vd_vblocknr); > + if (ret) { > + nilfs_transaction_abort(inode->i_sb); > + goto out_free; > + } > + } > + } > + > + nilfs_transaction_commit(inode->i_sb); > + > +out_free: > + vfree(kbuf); > +out: > + mnt_drop_write_file(filp); > + return ret; > +} > + > +/** > * nilfs_ioctl_sync - make a checkpoint > * @inode: inode object > * @filp: file object > @@ -1332,6 +1430,8 @@ long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) > return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp); > case NILFS_IOCTL_CLEAN_SEGMENTS: > return nilfs_ioctl_clean_segments(inode, filp, cmd, argp); > + case NILFS_IOCTL_CLEAN_SNAPSHOT_FLAGS: > + return nilfs_ioctl_clean_snapshot_flags(inode, filp, cmd, argp); > case NILFS_IOCTL_SYNC: > return nilfs_ioctl_sync(inode, filp, cmd, argp); > case NILFS_IOCTL_RESIZE: > @@ -1368,6 +1468,7 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) > case NILFS_IOCTL_GET_VINFO: > case NILFS_IOCTL_GET_BDESCS: > case NILFS_IOCTL_CLEAN_SEGMENTS: > + case NILFS_IOCTL_CLEAN_SNAPSHOT_FLAGS: Sounds for me that we clean all snapshot's flags. > case NILFS_IOCTL_SYNC: > case NILFS_IOCTL_RESIZE: > case NILFS_IOCTL_SET_ALLOC_RANGE: > diff --git a/include/linux/nilfs2_fs.h b/include/linux/nilfs2_fs.h > index ba9ebe02..30ddc86 100644 > --- a/include/linux/nilfs2_fs.h > +++ b/include/linux/nilfs2_fs.h > @@ -863,7 +863,7 @@ struct nilfs_vinfo { > * @vd_blocknr: disk block number > * @vd_offset: logical block offset inside a file > * @vd_flags: flags (data or node block) > - * @vd_pad: padding > + * @vd_flags2: additional flags Ditto. Weird name. > */ > struct nilfs_vdesc { > __u64 vd_ino; > @@ -873,9 +873,55 @@ struct nilfs_vdesc { > __u64 vd_blocknr; > __u64 vd_offset; > __u32 vd_flags; > - __u32 vd_pad; > + /* vd_flags2 needed because of backwards compatibility */ Completely, misunderstand comment. Usually, it keeps old fields for backward compatibility. But this flag is new. > + __u32 vd_flags2; > }; > > +/* vdesc flags */ To be honest, I misunderstand why such number of flags and why namely such flags? Comments are really necessary. > +enum { > + NILFS_VDESC_DATA, > + NILFS_VDESC_NODE, > + /* ... */ What does it mean? > +}; > +enum { > + NILFS_VDESC_SNAPSHOT, > + __NR_NILFS_VDESC_FIELDS, > + /* ... */ What does it mean? Thanks, Vyacheslav Dubeyko. > +}; > + > +#define NILFS_VDESC_FNS(flag, name) \ > +static inline void \ > +nilfs_vdesc_set_##name(struct nilfs_vdesc *vdesc) \ > +{ \ > + vdesc->vd_flags = NILFS_VDESC_##flag; \ > +} \ > +static inline int \ > +nilfs_vdesc_##name(const struct nilfs_vdesc *vdesc) \ > +{ \ > + return vdesc->vd_flags == NILFS_VDESC_##flag; \ > +} > + > +#define NILFS_VDESC_FNS2(flag, name) \ > +static inline void \ > +nilfs_vdesc_set_##name(struct nilfs_vdesc *vdesc) \ > +{ \ > + vdesc->vd_flags2 |= (1UL << NILFS_VDESC_##flag); \ > +} \ > +static inline void \ > +nilfs_vdesc_clear_##name(struct nilfs_vdesc *vdesc) \ > +{ \ > + vdesc->vd_flags2 &= ~(1UL << NILFS_VDESC_##flag); \ > +} \ > +static inline int \ > +nilfs_vdesc_##name(const struct nilfs_vdesc *vdesc) \ > +{ \ > + return !!(vdesc->vd_flags2 & (1UL << NILFS_VDESC_##flag)); \ > +} > + > +NILFS_VDESC_FNS(DATA, data) > +NILFS_VDESC_FNS(NODE, node) > +NILFS_VDESC_FNS2(SNAPSHOT, snapshot) > + > /** > * struct nilfs_bdesc - descriptor of disk block number > * @bd_ino: inode number > @@ -922,5 +968,7 @@ struct nilfs_bdesc { > _IOW(NILFS_IOCTL_IDENT, 0x8C, __u64[2]) > #define NILFS_IOCTL_SET_SUINFO \ > _IOW(NILFS_IOCTL_IDENT, 0x8D, struct nilfs_argv) > +#define NILFS_IOCTL_CLEAN_SNAPSHOT_FLAGS \ > + _IOW(NILFS_IOCTL_IDENT, 0x8F, struct nilfs_argv) > > #endif /* _LINUX_NILFS_FS_H */ -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html