Re: [PATCH v2 2/2] ext4: add ioctl FS_IOC_CHKPT_JRNL

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Apr 07, 2021 at 03:42:02PM +0000, Leah Rumancik wrote:
> ioctl FS_IOC_CHKPT_JRNL checkpoints and flushes the journal. With the
> CHKPT_JRNL_DISCARD flag set, the journal blocks are also discarded.
> With the filename wipeout patch, Ext4 guarantees that all data will be
> discarded on deletion. This ioctl allows for periodically discarding
> journal contents too.

This needs a documentation update to cover what this new userspace ABI
does, and probably linux-api and linux-fsdevel should be cc'd.

> Also, add journal discard (if discard supported) during journal load
> after recovery. This provides a potential solution to
> https://lore.kernel.org/linux-ext4/YDZoaacIYStFQT8g@xxxxxxx/ for
> disks that support discard. After a successful journal recovery, e2fsck
> can call this ioctl to discard the journal as well.
> 
> Signed-off-by: Leah Rumancik <leah.rumancik@xxxxxxxxx>
> ---
>  fs/ext4/ext4.h          |   1 +
>  fs/ext4/inode.c         |   4 +-
>  fs/ext4/ioctl.c         |  34 ++++++++++++--
>  fs/ext4/super.c         |   6 +--
>  fs/jbd2/journal.c       | 100 +++++++++++++++++++++++++++++++++++++++-
>  fs/ocfs2/alloc.c        |   2 +-
>  fs/ocfs2/journal.c      |   8 ++--
>  include/linux/jbd2.h    |   5 +-
>  include/uapi/linux/fs.h |   4 ++
>  9 files changed, 148 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 826a56e3bbd2..e76e9961e992 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -724,6 +724,7 @@ enum {
>  #define EXT4_IOC_CLEAR_ES_CACHE		_IO('f', 40)
>  #define EXT4_IOC_GETSTATE		_IOW('f', 41, __u32)
>  #define EXT4_IOC_GET_ES_CACHE		_IOWR('f', 42, struct fiemap)
> +/* ioctl code 43 reserved for FS_IOC_JRNL_CHKPT */

Pat Sajak hates it when people won't buy vowels. ;)

FS_IOC_CHECKPOINT, maybe?

>  
>  #define EXT4_IOC_SHUTDOWN _IOR ('X', 125, __u32)
>  
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 0948a43f1b3d..715077e30c58 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3225,7 +3225,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
>  		ext4_clear_inode_state(inode, EXT4_STATE_JDATA);
>  		journal = EXT4_JOURNAL(inode);
>  		jbd2_journal_lock_updates(journal);
> -		err = jbd2_journal_flush(journal);
> +		err = jbd2_journal_flush(journal, 0);
>  		jbd2_journal_unlock_updates(journal);
>  
>  		if (err)
> @@ -6007,7 +6007,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	if (val)
>  		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
>  	else {
> -		err = jbd2_journal_flush(journal);
> +		err = jbd2_journal_flush(journal, 0);
>  		if (err < 0) {
>  			jbd2_journal_unlock_updates(journal);
>  			percpu_up_write(&sbi->s_writepages_rwsem);
> diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
> index a2cf35066f46..ca64c680ef6d 100644
> --- a/fs/ext4/ioctl.c
> +++ b/fs/ext4/ioctl.c
> @@ -756,7 +756,7 @@ static long ext4_ioctl_group_add(struct file *file,
>  	err = ext4_group_add(sb, input);
>  	if (EXT4_SB(sb)->s_journal) {
>  		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> -		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> +		err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
>  		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
>  	}
>  	if (err == 0)
> @@ -939,7 +939,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  		err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
>  		if (EXT4_SB(sb)->s_journal) {
>  			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> -			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> +			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
>  			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
>  		}
>  		if (err == 0)
> @@ -1082,7 +1082,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  		if (EXT4_SB(sb)->s_journal) {
>  			ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_RESIZE);
>  			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> -			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> +			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
>  			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
>  		}
>  		if (err == 0)
> @@ -1318,6 +1318,33 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
>  			return -EOPNOTSUPP;
>  		return fsverity_ioctl_read_metadata(filp,
>  						    (const void __user *)arg);
> +	case FS_IOC_CHKPT_JRNL:
> +	{

Should this whole block be a separate static function?

> +		int err = 0;
> +		unsigned long long flags = 0;
> +
> +		if (!capable(CAP_SYS_ADMIN))
> +			return -EPERM;
> +
> +		/* file argument is not the mount point */
> +		if (file_dentry(filp) != sb->s_root)
> +			return -EINVAL;
> +
> +		/* filesystem is not backed by block device */
> +		if (sb->s_bdev == NULL)
> +			return -EINVAL;
> +
> +		if (copy_from_user(&flags, (__u64 __user *)arg,
> +					sizeof(__u64)))
> +			return -EFAULT;

Needs a check here to return -EINVAL if any flags bits are set other
than CHKPT_JRNL_DO_DISCARD, particularly since you're feeding flags
straight into jbd2 code.

FWIW I would also separate the changes into two patches -- one to
add the flags parameter to jbd2_journal_flush, and a second patch to
define the new ioctl and wire it up.

> +
> +		if (EXT4_SB(sb)->s_journal) {
> +			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> +			err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, flags);
> +			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
> +		}
> +		return err;
> +	}
>  
>  	default:
>  		return -ENOTTY;
> @@ -1407,6 +1434,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
>  	case EXT4_IOC_GET_ES_CACHE:
>  	case FS_IOC_FSGETXATTR:
>  	case FS_IOC_FSSETXATTR:
> +	case FS_IOC_CHKPT_JRNL:
>  		break;
>  	default:
>  		return -ENOIOCTLCMD;
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index b9693680463a..1b3a9eb58b63 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -5613,7 +5613,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb,
>  		return 0;
>  	}
>  	jbd2_journal_lock_updates(journal);
> -	err = jbd2_journal_flush(journal);
> +	err = jbd2_journal_flush(journal, 0);
>  	if (err < 0)
>  		goto out;
>  
> @@ -5755,7 +5755,7 @@ static int ext4_freeze(struct super_block *sb)
>  		 * Don't clear the needs_recovery flag if we failed to
>  		 * flush the journal.
>  		 */
> -		error = jbd2_journal_flush(journal);
> +		error = jbd2_journal_flush(journal, 0);
>  		if (error < 0)
>  			goto out;
>  
> @@ -6349,7 +6349,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
>  		 * otherwise be livelocked...
>  		 */
>  		jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
> -		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
> +		err = jbd2_journal_flush(EXT4_SB(sb)->s_journal, 0);
>  		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
>  		if (err)
>  			return err;
> diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
> index 2dc944442802..6bb5980ac789 100644
> --- a/fs/jbd2/journal.c
> +++ b/fs/jbd2/journal.c
> @@ -1686,6 +1686,90 @@ static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
>  	write_unlock(&journal->j_state_lock);
>  }
>  
> +/* discard journal blocks excluding journal superblock */
> +static int __jbd2_journal_issue_discard(journal_t *journal)
> +{
> +	int err = 0;
> +	unsigned long block, log_offset; /* logical */
> +	unsigned long long phys_block, block_start, block_stop; /* physical */
> +	loff_t byte_start, byte_stop, byte_count;
> +	struct request_queue *q = bdev_get_queue(journal->j_dev);
> +
> +	if (!q)
> +		return -ENXIO;
> +
> +	if (!blk_queue_discard(q))
> +		return -EOPNOTSUPP;
> +
> +	/* lookup block mapping and issue discard for each contiguous region */
> +	log_offset = be32_to_cpu(journal->j_superblock->s_first);
> +
> +	err = jbd2_journal_bmap(journal, log_offset, &block_start);
> +	if (err) {
> +		printk(KERN_ERR "JBD2: bad block at offset %lu", log_offset);
> +		return err;
> +	}
> +
> +	/*
> +	 * use block_start - 1 to meet check for contiguous with previous region:
> +	 * phys_block == block_stop + 1
> +	 */
> +	block_stop = block_start - 1;
> +
> +	for (block = log_offset; block < journal->j_total_len; block++) {
> +		err = jbd2_journal_bmap(journal, block, &phys_block);
> +		if (err) {
> +			printk(KERN_ERR "JBD2: bad block at offset %lu", block);
> +			return err;
> +		}
> +
> +		/*
> +		 * if block is last block, update stopping point
> +		 * if not last block and
> +		 * block is contiguous with previous block, continue
> +		 */
> +		if (block == journal->j_total_len - 1)
> +			block_stop = phys_block;
> +		else if (phys_block == block_stop + 1) {
> +			block_stop++;
> +			continue;
> +		}
> +
> +		/*
> +		 * if not contiguous with prior physical block or this is last
> +		 * block of journal, take care of the region
> +		 */
> +		byte_start = block_start * journal->j_blocksize;
> +		byte_stop = block_stop * journal->j_blocksize;
> +		byte_count = (block_stop - block_start + 1) *
> +			journal->j_blocksize;
> +
> +		truncate_inode_pages_range(journal->j_dev->bd_inode->i_mapping,
> +			byte_start, byte_stop);
> +
> +		/*
> +		 * use blkdev_issue_discard instead of sb_issue_discard
> +		 * because superblock not yet populated when this is
> +		 * called during journal_load during mount process
> +		 */
> +		err = blkdev_issue_discard(journal->j_dev,
> +			byte_start >> SECTOR_SHIFT,
> +			byte_count >> SECTOR_SHIFT,
> +			GFP_NOFS, 0);
> +
> +		if (unlikely(err != 0)) {
> +			printk(KERN_ERR "JBD2: unable to discard "
> +				"journal at physical blocks %llu - %llu",
> +				block_start, block_stop);
> +			return err;
> +		}
> +
> +		block_start = phys_block;
> +		block_stop = phys_block;
> +	}
> +
> +	return blkdev_issue_flush(journal->j_dev);
> +}
>  
>  /**
>   * jbd2_journal_update_sb_errno() - Update error in the journal.
> @@ -1936,6 +2020,10 @@ int jbd2_journal_load(journal_t *journal)
>  	 */
>  	journal->j_flags &= ~JBD2_ABORT;
>  
> +	/* if journal device supports discard, discard journal blocks */
> +	if (__jbd2_journal_issue_discard(journal))
> +		printk(KERN_WARNING "JBD2: failed to discard journal when loading");

How badly do you need to log this?  The journal still works, right?
This is going to trigger on all the old spinning disks that don't
support discard.

> +
>  	/* OK, we've finished with the dynamic journal bits:
>  	 * reinitialise the dynamic contents of the superblock in memory
>  	 * and reset them on disk. */
> @@ -2246,13 +2334,17 @@ EXPORT_SYMBOL(jbd2_journal_clear_features);
>  /**
>   * jbd2_journal_flush() - Flush journal
>   * @journal: Journal to act on.
> + * @flags: options (see below)
>   *
>   * Flush all data for a given journal to disk and empty the journal.
>   * Filesystems can use this when remounting readonly to ensure that
>   * recovery does not need to happen on remount.
> + *
> + * flags:
> + * JBD2_FLAG_DO_DISCARD: also discard the journal blocks; if discard is not
> + *	supported on the device, returns err
>   */
> -
> -int jbd2_journal_flush(journal_t *journal)
> +int jbd2_journal_flush(journal_t *journal, unsigned long long flags)
>  {
>  	int err = 0;
>  	transaction_t *transaction = NULL;
> @@ -2306,6 +2398,10 @@ int jbd2_journal_flush(journal_t *journal)
>  	 * commits of data to the journal will restore the current
>  	 * s_start value. */
>  	jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA);
> +
> +	if (flags & JBD2_FLAG_DO_DISCARD)
> +		err = __jbd2_journal_issue_discard(journal);
> +
>  	mutex_unlock(&journal->j_checkpoint_mutex);
>  	write_lock(&journal->j_state_lock);
>  	J_ASSERT(!journal->j_running_transaction);
> diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
> index 78710788c237..1b41bf9f4a7e 100644
> --- a/fs/ocfs2/alloc.c
> +++ b/fs/ocfs2/alloc.c
> @@ -6020,7 +6020,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
>  	 * Then truncate log will be replayed resulting in cluster double free.
>  	 */
>  	jbd2_journal_lock_updates(journal->j_journal);
> -	status = jbd2_journal_flush(journal->j_journal);
> +	status = jbd2_journal_flush(journal->j_journal, 0);
>  	jbd2_journal_unlock_updates(journal->j_journal);
>  	if (status < 0) {
>  		mlog_errno(status);
> diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
> index db52e843002a..a1438548747e 100644
> --- a/fs/ocfs2/journal.c
> +++ b/fs/ocfs2/journal.c
> @@ -310,7 +310,7 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb)
>  	}
>  
>  	jbd2_journal_lock_updates(journal->j_journal);
> -	status = jbd2_journal_flush(journal->j_journal);
> +	status = jbd2_journal_flush(journal->j_journal, 0);
>  	jbd2_journal_unlock_updates(journal->j_journal);
>  	if (status < 0) {
>  		up_write(&journal->j_trans_barrier);
> @@ -1002,7 +1002,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
>  
>  	if (ocfs2_mount_local(osb)) {
>  		jbd2_journal_lock_updates(journal->j_journal);
> -		status = jbd2_journal_flush(journal->j_journal);
> +		status = jbd2_journal_flush(journal->j_journal, 0);
>  		jbd2_journal_unlock_updates(journal->j_journal);
>  		if (status < 0)
>  			mlog_errno(status);
> @@ -1072,7 +1072,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
>  
>  	if (replayed) {
>  		jbd2_journal_lock_updates(journal->j_journal);
> -		status = jbd2_journal_flush(journal->j_journal);
> +		status = jbd2_journal_flush(journal->j_journal, 0);
>  		jbd2_journal_unlock_updates(journal->j_journal);
>  		if (status < 0)
>  			mlog_errno(status);
> @@ -1668,7 +1668,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
>  
>  	/* wipe the journal */
>  	jbd2_journal_lock_updates(journal);
> -	status = jbd2_journal_flush(journal);
> +	status = jbd2_journal_flush(journal, 0);
>  	jbd2_journal_unlock_updates(journal);
>  	if (status < 0)
>  		mlog_errno(status);
> diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
> index 99d3cd051ac3..50510473283e 100644
> --- a/include/linux/jbd2.h
> +++ b/include/linux/jbd2.h
> @@ -307,6 +307,9 @@ typedef struct journal_superblock_s
>  					JBD2_FEATURE_INCOMPAT_CSUM_V3 | \
>  					JBD2_FEATURE_INCOMPAT_FAST_COMMIT)
>  
> +/* discard journal blocks flag for jbd2_journal_flush */
> +#define JBD2_FLAG_DO_DISCARD		1
> +
>  #ifdef __KERNEL__
>  
>  #include <linux/fs.h>
> @@ -1491,7 +1494,7 @@ extern int	 jbd2_journal_invalidatepage(journal_t *,
>  				struct page *, unsigned int, unsigned int);
>  extern int	 jbd2_journal_try_to_free_buffers(journal_t *journal, struct page *page);
>  extern int	 jbd2_journal_stop(handle_t *);
> -extern int	 jbd2_journal_flush (journal_t *);
> +extern int	 jbd2_journal_flush(journal_t *journal, unsigned long long flags);
>  extern void	 jbd2_journal_lock_updates (journal_t *);
>  extern void	 jbd2_journal_unlock_updates (journal_t *);
>  
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f44eb0a04afd..d66408c38c1d 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -214,6 +214,7 @@ struct fsxattr {
>  #define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
>  #define FS_IOC_GETFSLABEL		_IOR(0x94, 49, char[FSLABEL_MAX])
>  #define FS_IOC_SETFSLABEL		_IOW(0x94, 50, char[FSLABEL_MAX])
> +#define FS_IOC_CHKPT_JRNL		_IOW('f', 43, __u64)
>  
>  /*
>   * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS)
> @@ -304,4 +305,7 @@ typedef int __bitwise __kernel_rwf_t;
>  #define RWF_SUPPORTED	(RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
>  			 RWF_APPEND)
>  
> +/* flag for ioctl FS_IOC_JRNL_CHKPT */
> +#define CHKPT_JRNL_DO_DISCARD	1

FS_IOC_CHECKPOINT_FLAG_DISCARD, to go with FS_IOC_CHECKPOINT?

--D

> +
>  #endif /* _UAPI_LINUX_FS_H */
> -- 
> 2.31.0.208.g409f899ff0-goog
> 



[Index of Archives]     [Reiser Filesystem Development]     [Ceph FS]     [Kernel Newbies]     [Security]     [Netfilter]     [Bugtraq]     [Linux FS]     [Yosemite National Park]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux RAID]     [Samba]     [Device Mapper]     [Linux Media]

  Powered by Linux