On Apr 2, 2022, at 2:40 AM, anserper@xxxxx wrote: > > From: Andrew Perepechko <andrew.perepechko@xxxxxxx> > > When changing a large xattr value to a different large xattr value, > the old xattr inode is freed. Truncate during the final iput causes > current transaction restart. Eventually, parent inode bh is marked > dirty and kernel panic happens when jbd2 figures out that this bh > belongs to the committed transaction. > > A possible fix is to call this final iput in a separate thread. > This way, setxattr transactions will never be split into two. > Since the setxattr code adds xattr inodes with nlink=0 into the > orphan list, old xattr inodes will be properly cleaned up in > any case. > > Signed-off-by: Andrew Perepechko <andrew.perepechko@xxxxxxx> Reviewed-by: Andreas Dilger <adilger@xxxxxxxxx> > HPE-bug-id: LUS-10534 > > Changes since v1: > - fixed a bug added during the porting > - fixed a workqueue related deadlock reported by Tetsuo Handa > --- > fs/ext4/ext4.h | 7 +++++-- > fs/ext4/page-io.c | 2 +- > fs/ext4/super.c | 15 ++++++++------- > fs/ext4/xattr.c | 39 +++++++++++++++++++++++++++++++++++++-- > 4 files changed, 51 insertions(+), 12 deletions(-) > > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h > index 3f87cca49f0c..52db5d6bae7f 100644 > --- a/fs/ext4/ext4.h > +++ b/fs/ext4/ext4.h > @@ -1650,8 +1650,11 @@ struct ext4_sb_info { > struct flex_groups * __rcu *s_flex_groups; > ext4_group_t s_flex_groups_allocated; > > - /* workqueue for reserved extent conversions (buffered io) */ > - struct workqueue_struct *rsv_conversion_wq; > + /* > + * workqueue for reserved extent conversions (buffered io) > + * and large ea inodes reclaim > + */ > + struct workqueue_struct *s_misc_wq; > > /* timer for periodic error stats printing */ > struct timer_list s_err_report; > diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c > index 495ce59fb4ad..0142b88471ff 100644 > --- a/fs/ext4/page-io.c > +++ b/fs/ext4/page-io.c > @@ -228,7 +228,7 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) > WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); > WARN_ON(!io_end->handle && sbi->s_journal); > spin_lock_irqsave(&ei->i_completed_io_lock, flags); > - wq = sbi->rsv_conversion_wq; > + wq = sbi->s_misc_wq; > if (list_empty(&ei->i_rsv_conversion_list)) > queue_work(wq, &ei->i_rsv_conversion_work); > list_add_tail(&io_end->list, &ei->i_rsv_conversion_list); > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index 81749eaddf4c..ee03f593b264 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1200,10 +1200,11 @@ static void ext4_put_super(struct super_block *sb) > int i, err; > > ext4_unregister_li_request(sb); > + flush_workqueue(sbi->s_misc_wq); > ext4_quota_off_umount(sb); > > flush_work(&sbi->s_error_work); > - destroy_workqueue(sbi->rsv_conversion_wq); > + destroy_workqueue(sbi->s_misc_wq); > ext4_release_orphan_info(sb); > > /* > @@ -5294,9 +5295,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) > * The maximum number of concurrent works can be high and > * concurrency isn't really necessary. Limit it to 1. > */ > - EXT4_SB(sb)->rsv_conversion_wq = > - alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); > - if (!EXT4_SB(sb)->rsv_conversion_wq) { > + EXT4_SB(sb)->s_misc_wq = > + alloc_workqueue("ext4-misc", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); > + if (!EXT4_SB(sb)->s_misc_wq) { > printk(KERN_ERR "EXT4-fs: failed to create workqueue\n"); > ret = -ENOMEM; > goto failed_mount4; > @@ -5514,8 +5515,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) > sb->s_root = NULL; > failed_mount4: > ext4_msg(sb, KERN_ERR, "mount failed"); > - if (EXT4_SB(sb)->rsv_conversion_wq) > - destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq); > + if (EXT4_SB(sb)->s_misc_wq) > + destroy_workqueue(EXT4_SB(sb)->s_misc_wq); > failed_mount_wq: > ext4_xattr_destroy_cache(sbi->s_ea_inode_cache); > sbi->s_ea_inode_cache = NULL; > @@ -6129,7 +6130,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) > return 0; > > trace_ext4_sync_fs(sb, wait); > - flush_workqueue(sbi->rsv_conversion_wq); > + flush_workqueue(sbi->s_misc_wq); > /* > * Writeback quota in non-journalled quota case - journalled quota has > * no dirty dquots > diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c > index 042325349098..ee13675fbead 100644 > --- a/fs/ext4/xattr.c > +++ b/fs/ext4/xattr.c > @@ -1544,6 +1544,36 @@ static int ext4_xattr_inode_lookup_create(handle_t *handle, struct inode *inode, > return 0; > } > > +struct delayed_iput_work { > + struct work_struct work; > + struct inode *inode; > +}; > + > +static void delayed_iput_fn(struct work_struct *work) > +{ > + struct delayed_iput_work *diwork; > + > + diwork = container_of(work, struct delayed_iput_work, work); > + iput(diwork->inode); > + kfree(diwork); > +} > + > +static void delayed_iput(struct inode *inode, struct delayed_iput_work *work) > +{ > + if (!inode) { > + kfree(work); > + return; > + } > + > + if (!work) { > + iput(inode); > + } else { > + INIT_WORK(&work->work, delayed_iput_fn); > + work->inode = inode; > + queue_work(EXT4_SB(inode->i_sb)->s_misc_wq, &work->work); > + } > +} > + > /* > * Reserve min(block_size/8, 1024) bytes for xattr entries/names if ea_inode > * feature is enabled. > @@ -1561,6 +1591,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > int in_inode = i->in_inode; > struct inode *old_ea_inode = NULL; > struct inode *new_ea_inode = NULL; > + struct delayed_iput_work *diwork = NULL; > size_t old_size, new_size; > int ret; > > @@ -1637,7 +1668,11 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > * Finish that work before doing any modifications to the xattr data. > */ > if (!s->not_found && here->e_value_inum) { > - ret = ext4_xattr_inode_iget(inode, > + diwork = kmalloc(sizeof(*diwork), GFP_NOFS); > + if (!diwork) > + ret = -ENOMEM; > + else > + ret = ext4_xattr_inode_iget(inode, > le32_to_cpu(here->e_value_inum), > le32_to_cpu(here->e_hash), > &old_ea_inode); > @@ -1790,7 +1825,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, > > ret = 0; > out: > - iput(old_ea_inode); > + delayed_iput(old_ea_inode, diwork); > iput(new_ea_inode); > return ret; > } > -- > 2.25.1 > Cheers, Andreas
Attachment:
signature.asc
Description: Message signed with OpenPGP