On Wed, Apr 10, 2024 at 10:29:38PM +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@xxxxxxxxxx>
> 
> Implement the buffered write iomap path; use ext4_da_map_blocks() to
> map delalloc extents and add ext4_iomap_get_blocks() to allocate blocks
> if delalloc is disabled or free space is about to run out.
> 
> Note that we always allocate unwritten extents for new blocks in the
> iomap write path; this means the allocation type is no longer
> controlled by the dioread_nolock mount option. After that, we can
> postpone the i_disksize update to the writeback path and drop the
> journal handle in the buffered delalloc write path completely.
> 
> Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx>
> ---
>  fs/ext4/ext4.h  |   3 +
>  fs/ext4/file.c  |  19 +++++-
>  fs/ext4/inode.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++--
>  3 files changed, 183 insertions(+), 7 deletions(-)
> 
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 05949a8136ae..2bd543c43341 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -2970,6 +2970,7 @@ int ext4_walk_page_buffers(handle_t *handle,
>  				struct buffer_head *bh));
>  int do_journal_get_write_access(handle_t *handle, struct inode *inode,
>  				struct buffer_head *bh);
> +int ext4_nonda_switch(struct super_block *sb);
>  #define FALL_BACK_TO_NONDELALLOC 1
>  #define CONVERT_INLINE_DATA	 2
> 
> @@ -3827,6 +3828,8 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
>  extern const struct iomap_ops ext4_iomap_ops;
>  extern const struct iomap_ops ext4_iomap_overwrite_ops;
>  extern const struct iomap_ops ext4_iomap_report_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_write_ops;
> +extern const struct iomap_ops ext4_iomap_buffered_da_write_ops;
> 
>  static inline int ext4_buffer_uptodate(struct buffer_head *bh)
>  {
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 54d6ff22585c..52f37c49572a 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -282,6 +282,20 @@ static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from)
>  	return count;
>  }
> 
> +static ssize_t ext4_iomap_buffered_write(struct kiocb *iocb,
> +					 struct iov_iter *from)
> +{
> +	struct inode *inode = file_inode(iocb->ki_filp);
> +	const struct iomap_ops *iomap_ops;
> +
> +	if (test_opt(inode->i_sb, DELALLOC) && !ext4_nonda_switch(inode->i_sb))
> +		iomap_ops = &ext4_iomap_buffered_da_write_ops;
> +	else
> +		iomap_ops = &ext4_iomap_buffered_write_ops;
> +
> +	return iomap_file_buffered_write(iocb, from, iomap_ops);
> +}
> +
>  static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  					struct iov_iter *from)
>  {
> @@ -296,7 +310,10 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
>  	if (ret <= 0)
>  		goto out;
> 
> -	ret = generic_perform_write(iocb, from);
> +	if (ext4_test_inode_state(inode, EXT4_STATE_BUFFERED_IOMAP))
> +		ret = ext4_iomap_buffered_write(iocb, from);
> +	else
> +		ret = generic_perform_write(iocb, from);
> 
>  out:
>  	inode_unlock(inode);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 20eb772f4f62..e825ed16fd60 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -2857,7 +2857,7 @@ static int ext4_dax_writepages(struct address_space *mapping,
>  	return ret;
>  }
> 
> -static int ext4_nonda_switch(struct super_block *sb)
> +int ext4_nonda_switch(struct super_block *sb)
>  {
>  	s64 free_clusters, dirty_clusters;
>  	struct ext4_sb_info *sbi = EXT4_SB(sb);
> @@ -3254,6 +3254,15 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
>  	return inode->i_state & I_DIRTY_DATASYNC;
>  }
> 
> +static bool ext4_iomap_valid(struct inode *inode, const struct iomap *iomap)
> +{
> +	return iomap->validity_cookie == READ_ONCE(EXT4_I(inode)->i_es_seq);
> +}
> +
> +static const struct iomap_folio_ops ext4_iomap_folio_ops = {
> +	.iomap_valid = ext4_iomap_valid,
> +};
> +
>  static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  			   struct ext4_map_blocks *map, loff_t offset,
>  			   loff_t length, unsigned int flags)
> @@ -3284,6 +3293,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
>  	    !ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
>  		iomap->flags |= IOMAP_F_MERGED;
> 
> +	iomap->validity_cookie = READ_ONCE(EXT4_I(inode)->i_es_seq);
> +	iomap->folio_ops = &ext4_iomap_folio_ops;
> +
>  	/*
>  	 * Flags passed to ext4_map_blocks() for direct I/O writes can result
>  	 * in m_flags having both EXT4_MAP_MAPPED and EXT4_MAP_UNWRITTEN bits
> @@ -3523,11 +3535,42 @@ const struct iomap_ops ext4_iomap_report_ops = {
>  	.iomap_begin = ext4_iomap_begin_report,
>  };
> 
> -static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
> +static int ext4_iomap_get_blocks(struct inode *inode,
> +				 struct ext4_map_blocks *map)
> +{
> +	handle_t *handle;
> +	int ret, needed_blocks;
> +
> +	/*
> +	 * Reserve one extra block for adding the inode to the orphan
> +	 * list in case we allocate blocks but the write fails for some
> +	 * reason.
> +	 */
> +	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
> +	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +
> +	ret = ext4_map_blocks(handle, inode, map,
> +			      EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT);
> +	/*
> +	 * We have to stop the journal here because of a potential
> +	 * deadlock with the later balance_dirty_pages(): it may wait
> +	 * for dirty pages to be written back, which may start another
> +	 * handle and wait for this handle to stop.
> +	 */
> +	ext4_journal_stop(handle);
> +
> +	return ret;
> +}
> +
> +#define IOMAP_F_EXT4_DELALLOC		IOMAP_F_PRIVATE
> +
> +static int __ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>  			loff_t length, unsigned int iomap_flags,
> -			struct iomap *iomap, struct iomap *srcmap)
> +			struct iomap *iomap, struct iomap *srcmap,
> +			bool delalloc)
>  {
> -	int ret;
> +	int ret, retries = 0;
>  	struct ext4_map_blocks map;
>  	u8 blkbits = inode->i_blkbits;
> 
> @@ -3537,20 +3580,133 @@ static int ext4_iomap_buffered_io_begin(struct inode *inode, loff_t offset,
>  		return -EINVAL;
>  	if (WARN_ON_ONCE(ext4_has_inline_data(inode)))
>  		return -ERANGE;
> -
> +retry:
>  	/* Calculate the first and last logical blocks respectively. */
>  	map.m_lblk = offset >> blkbits;
>  	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
>  			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
> +	if (iomap_flags & IOMAP_WRITE) {
> +		if (delalloc)
> +			ret = ext4_da_map_blocks(inode, &map);
> +		else
> +			ret = ext4_iomap_get_blocks(inode, &map);
> 
> -	ret = ext4_map_blocks(NULL, inode, &map, 0);
> +		if (ret == -ENOSPC &&
> +		    ext4_should_retry_alloc(inode->i_sb, &retries))
> +			goto retry;
> +	} else {
> +		ret = ext4_map_blocks(NULL, inode, &map, 0);
> +	}
>  	if (ret < 0)
>  		return ret;
> 
>  	ext4_set_iomap(inode, iomap, &map, offset, length, iomap_flags);
> +	if (delalloc)
> +		iomap->flags |= IOMAP_F_EXT4_DELALLOC;
> +
> +	return 0;
> +}

Why are you implementing both read and write mapping paths in the one
function? The whole point of having separate ops vectors for read and
write is that it allows a clean separation of the read and write mapping
operations. i.e. there is no need to use "if (write) else {do read}"
code constructs at all. You can even have a different delalloc mapping
function so you don't need "if (delalloc) else {do nonda}" branches
everywhere...
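Something like this completely untested sketch is what I mean - the
split itself is invented here, the helper names come from the patch,
and the overflow/inline-data checks are omitted for brevity:

static int ext4_iomap_buffered_read_begin(struct inode *inode,
		loff_t offset, loff_t length, unsigned int flags,
		struct iomap *iomap, struct iomap *srcmap)
{
	struct ext4_map_blocks map;
	u8 blkbits = inode->i_blkbits;
	int ret;

	/* Read mappings never allocate - no journal, no retry loop. */
	map.m_lblk = offset >> blkbits;
	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
	ret = ext4_map_blocks(NULL, inode, &map, 0);
	if (ret < 0)
		return ret;
	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
	return 0;
}

static int ext4_iomap_buffered_da_write_begin(struct inode *inode,
		loff_t offset, loff_t length, unsigned int flags,
		struct iomap *iomap, struct iomap *srcmap)
{
	struct ext4_map_blocks map;
	u8 blkbits = inode->i_blkbits;
	int ret, retries = 0;

retry:
	map.m_lblk = offset >> blkbits;
	map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
			  EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
	/* Delalloc write mapping - reserve space, don't allocate. */
	ret = ext4_da_map_blocks(inode, &map);
	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
	if (ret < 0)
		return ret;
	ext4_set_iomap(inode, iomap, &map, offset, length, flags);
	return 0;
}

with a third nonda write variant that calls ext4_iomap_get_blocks()
instead of ext4_da_map_blocks(). Each ops vector then points at exactly
one of these, and all the branching goes away.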
> +
> +static inline int ext4_iomap_buffered_io_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> +					      iomap, srcmap, false);
> +}
> +
> +static inline int ext4_iomap_buffered_da_write_begin(struct inode *inode,
> +		loff_t offset, loff_t length, unsigned int flags,
> +		struct iomap *iomap, struct iomap *srcmap)
> +{
> +	return __ext4_iomap_buffered_io_begin(inode, offset, length, flags,
> +					      iomap, srcmap, true);
> +}
> +
> +/*
> + * Drop the stale delayed allocation range on write failure, including
> + * both start and end blocks. Otherwise we could leave a range of
> + * delayed extents covered by a clean folio, which could lead to
> + * inaccurate space reservation.
> + */
> +static int ext4_iomap_punch_delalloc(struct inode *inode, loff_t offset,
> +				     loff_t length)
> +{
> +	ext4_es_remove_extent(inode, offset >> inode->i_blkbits,
> +			DIV_ROUND_UP_ULL(length, EXT4_BLOCK_SIZE(inode->i_sb)));
> +	return 0;
> +}
> +
> +static int ext4_iomap_buffered_write_end(struct inode *inode, loff_t offset,
> +					 loff_t length, ssize_t written,
> +					 unsigned int flags,
> +					 struct iomap *iomap)
> +{
> +	handle_t *handle;
> +	loff_t end;
> +	int ret = 0, ret2;
> +
> +	/* delalloc */
> +	if (iomap->flags & IOMAP_F_EXT4_DELALLOC) {
> +		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
> +			offset, length, written, ext4_iomap_punch_delalloc);
> +		if (ret)
> +			ext4_warning(inode->i_sb,
> +				"Failed to clean up delalloc for inode %lu, %d",
> +				inode->i_ino, ret);
> +		return ret;
> +	}

Why are you creating a delalloc extent for the write operation and then
immediately deleting it from the extent tree once the write operation is
done?

Delayed allocation is a state that persists for that file range until
the data is written back, right? But this appears to return the range to
a hole in the extent state tree. What happens when we do a second write
to this same range? Will it do another delayed allocation because there
are no extents over this range?

Also, why do you need IOMAP_F_EXT4_DELALLOC? Isn't a delalloc iomap set
up with iomap->type = IOMAP_DELALLOC? Why can't that be used?
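i.e. something like this untested fragment of the ->iomap_end path,
assuming ext4_set_iomap() reports delayed extents as IOMAP_DELALLOC, so
no private flag is needed:

	/* The core already tagged this mapping in ->iomap_begin. */
	if (iomap->type == IOMAP_DELALLOC) {
		ret = iomap_file_buffered_write_punch_delalloc(inode, iomap,
				offset, length, written,
				ext4_iomap_punch_delalloc);
		if (ret)
			ext4_warning(inode->i_sb,
				"Failed to clean up delalloc for inode %lu, %d",
				inode->i_ino, ret);
		return ret;
	}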
-Dave.

-- 
Dave Chinner
david@xxxxxxxxxxxxx