From: Zhang Yi <yi.zhang@xxxxxxxxxx> In the iomap_write_iter(), the iomap buffered write frame does not hold any locks between querying the inode extent mapping info and performing page cache writes. As a result, the extent mapping can be changed due to concurrent I/O in flight. Similarly, in the iomap_writepage_map(), the write-back process faces a similar problem: concurrent changes can invalidate the extent mapping before the I/O is submitted. Therefore, both of these processes must recheck the mapping info after acquiring the folio lock. To address this, similar to XFS, we propose introducing an extent sequence number to serve as a validity cookie for the extent. We will increment this number whenever the extent status tree changes, thereby preparing for the buffered write iomap conversion. Besides, it also changes the trace code style to make checkpatch.pl happy. Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx> --- fs/ext4/ext4.h | 1 + fs/ext4/extents_status.c | 13 ++++++++- fs/ext4/super.c | 1 + include/trace/events/ext4.h | 57 +++++++++++++++++++++---------------- 4 files changed, 46 insertions(+), 26 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6d0267afd4c1..44f6867d3037 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1123,6 +1123,7 @@ struct ext4_inode_info { ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for extents to shrink. Protected by i_es_lock */ + unsigned int i_es_seq; /* Change counter for extents */ /* ialloc */ ext4_group_t i_last_alloc_group; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index c786691dabd3..bea4f87db502 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -204,6 +204,13 @@ static inline ext4_lblk_t ext4_es_end(struct extent_status *es) return es->es_lblk + es->es_len - 1; } +static inline void ext4_es_inc_seq(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + + WRITE_ONCE(ei->i_es_seq, READ_ONCE(ei->i_es_seq) + 1); +} + /* * search through the tree for an delayed extent with a given offset. If * it can't be found, try to find next extent. @@ -872,6 +879,7 @@ void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, BUG_ON(end < lblk); WARN_ON_ONCE(status & EXTENT_STATUS_DELAYED); + ext4_es_inc_seq(inode); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, pblk, status); @@ -1519,13 +1527,15 @@ void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) return; - trace_ext4_es_remove_extent(inode, lblk, len); es_debug("remove [%u/%u) from extent status tree of inode %lu\n", lblk, len, inode->i_ino); if (!len) return; + ext4_es_inc_seq(inode); + trace_ext4_es_remove_extent(inode, lblk, len); + end = lblk + len - 1; BUG_ON(end < lblk); @@ -2107,6 +2117,7 @@ void ext4_es_insert_delayed_extent(struct inode *inode, ext4_lblk_t lblk, WARN_ON_ONCE((EXT4_B2C(sbi, lblk) == EXT4_B2C(sbi, end)) && end_allocated); + ext4_es_inc_seq(inode); newes.es_lblk = lblk; newes.es_len = len; ext4_es_store_pblock_status(&newes, ~0, EXTENT_STATUS_DELAYED); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 16a4ce704460..a01e0bbe57c8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1409,6 +1409,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_es_all_nr = 0; ei->i_es_shk_nr = 0; ei->i_es_shrink_lblk = 0; + ei->i_es_seq = 0; ei->i_reserved_data_blocks = 0; spin_lock_init(&(ei->i_block_reservation_lock)); ext4_init_pending_tree(&ei->i_pending_tree); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 156908641e68..6f2bf9035216 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -2176,12 +2176,13 @@ DECLARE_EVENT_CLASS(ext4__es_extent, TP_ARGS(inode, es), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( ext4_lblk_t, len ) - __field( ext4_fsblk_t, pblk ) - __field( char, status ) + __field(dev_t, dev) + __field(ino_t, ino) + __field(ext4_lblk_t, lblk) + __field(ext4_lblk_t, len) + __field(ext4_fsblk_t, pblk) + __field(char, status) + __field(unsigned int, seq) ), TP_fast_assign( @@ -2191,13 +2192,15 @@ DECLARE_EVENT_CLASS(ext4__es_extent, __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s seq %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, - __entry->pblk, show_extent_status(__entry->status)) + __entry->pblk, show_extent_status(__entry->status), + __entry->seq) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, @@ -2218,10 +2221,11 @@ TRACE_EVENT(ext4_es_remove_extent, TP_ARGS(inode, lblk, len), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( loff_t, lblk ) - __field( loff_t, len ) + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, lblk) + __field(loff_t, len) + __field(unsigned int, seq) ), TP_fast_assign( @@ -2229,12 +2233,13 @@ TRACE_EVENT(ext4_es_remove_extent, __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%lld/%lld)", + TP_printk("dev %d,%d ino %lu es [%lld/%lld) seq %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, - __entry->lblk, __entry->len) + __entry->lblk, __entry->len, __entry->seq) ); TRACE_EVENT(ext4_es_find_extent_range_enter, @@ -2486,14 +2491,15 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, TP_ARGS(inode, es, lclu_allocated, end_allocated), TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( ext4_lblk_t, lblk ) - __field( ext4_lblk_t, len ) - __field( ext4_fsblk_t, pblk ) - __field( char, status ) - __field( bool, lclu_allocated ) - __field( bool, end_allocated ) + __field(dev_t, dev) + __field(ino_t, ino) + __field(ext4_lblk_t, lblk) + __field(ext4_lblk_t, len) + __field(ext4_fsblk_t, pblk) + __field(char, status) + __field(bool, lclu_allocated) + __field(bool, end_allocated) + __field(unsigned int, seq) ), TP_fast_assign( @@ -2505,15 +2511,16 @@ TRACE_EVENT(ext4_es_insert_delayed_extent, __entry->status = ext4_es_status(es); __entry->lclu_allocated = lclu_allocated; __entry->end_allocated = end_allocated; + __entry->seq = EXT4_I(inode)->i_es_seq; ), - TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " - "allocated %d %d", + TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s allocated %d %d seq %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), - __entry->lclu_allocated, __entry->end_allocated) + __entry->lclu_allocated, __entry->end_allocated, + __entry->seq) ); /* fsmap traces */ -- 2.46.1