On Thu 10-06-21 19:24:37, Zhang Yi wrote: > Current metadata buffer release logic in bdev_try_to_free_page() has > a lot of use-after-free issues when the filesystem is unmounted concurrently, and > it is difficult to fix directly because ext4 is the only user of the > s_op->bdev_try_to_free_page callback and we may have to add more special > refcounts or locks that are only used by ext4 into the common vfs layer, > which is unacceptable. > > A better solution is to remove the bdev_try_to_free_page callback, but > the real problem is that we cannot easily release the journal_head on a > checkpointed buffer, so try_to_free_buffers() cannot release buffers and > pages under memory pressure, which is more likely to trigger > out-of-memory. So we cannot remove the callback directly before we find > another way to release the journal_head. > > This patch introduces a shrinker to free journal_heads on the checkpointed > transaction. Once the journal_head is freed, try_to_free_buffers() > can free the buffer properly. > > Signed-off-by: Zhang Yi <yi.zhang@xxxxxxxxxx> > Suggested-by: Jan Kara <jack@xxxxxxx> Looks good. 
Feel free to add: Reviewed-by: Jan Kara <jack@xxxxxxx> Honza > --- > fs/ext4/super.c | 8 ++ > fs/jbd2/checkpoint.c | 147 ++++++++++++++++++++++++++++++++++++ > fs/jbd2/journal.c | 87 +++++++++++++++++++++ > include/linux/jbd2.h | 26 +++++++ > include/trace/events/jbd2.h | 101 +++++++++++++++++++++++++ > 5 files changed, 369 insertions(+) > > diff --git a/fs/ext4/super.c b/fs/ext4/super.c > index d29f6aa7d96e..80064e566f56 100644 > --- a/fs/ext4/super.c > +++ b/fs/ext4/super.c > @@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb) > ext4_unregister_sysfs(sb); > > if (sbi->s_journal) { > + jbd2_journal_unregister_shrinker(sbi->s_journal); > aborted = is_journal_aborted(sbi->s_journal); > err = jbd2_journal_destroy(sbi->s_journal); > sbi->s_journal = NULL; > @@ -5178,6 +5179,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) > sbi->s_ea_block_cache = NULL; > > if (sbi->s_journal) { > + jbd2_journal_unregister_shrinker(sbi->s_journal); > jbd2_journal_destroy(sbi->s_journal); > sbi->s_journal = NULL; > } > @@ -5504,6 +5506,12 @@ static int ext4_load_journal(struct super_block *sb, > ext4_commit_super(sb); > } > > + err = jbd2_journal_register_shrinker(journal); > + if (err) { > + EXT4_SB(sb)->s_journal = NULL; > + goto err_out; > + } > + > return 0; > > err_out: > diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c > index 75a4f622afaf..1abdae44a3d8 100644 > --- a/fs/jbd2/checkpoint.c > +++ b/fs/jbd2/checkpoint.c > @@ -79,6 +79,18 @@ static inline void __buffer_relink_io(struct journal_head *jh) > transaction->t_checkpoint_io_list = jh; > } > > +/* > + * Check a checkpoint buffer could be release or not. > + * > + * Requires j_list_lock > + */ > +static inline bool __cp_buffer_busy(struct journal_head *jh) > +{ > + struct buffer_head *bh = jh2bh(jh); > + > + return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh)); > +} > + > /* > * Try to release a checkpointed buffer from its transaction. 
> * Returns 1 if we released it and 2 if we also released the > @@ -458,6 +470,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy) > return 0; > } > > +/* > + * journal_shrink_one_cp_list > + * > + * Find 'nr_to_scan' written-back checkpoint buffers in the given list > + * and try to release them. If the whole transaction is released, set > + * the 'released' parameter. Return the number of released checkpointed > + * buffers. > + * > + * Called with j_list_lock held. > + */ > +static unsigned long journal_shrink_one_cp_list(struct journal_head *jh, > + unsigned long *nr_to_scan, > + bool *released) > +{ > + struct journal_head *last_jh; > + struct journal_head *next_jh = jh; > + unsigned long nr_freed = 0; > + int ret; > + > + if (!jh || *nr_to_scan == 0) > + return 0; > + > + last_jh = jh->b_cpprev; > + do { > + jh = next_jh; > + next_jh = jh->b_cpnext; > + > + (*nr_to_scan)--; > + if (__cp_buffer_busy(jh)) > + continue; > + > + nr_freed++; > + ret = __jbd2_journal_remove_checkpoint(jh); > + if (ret) { > + *released = true; > + break; > + } > + > + if (need_resched()) > + break; > + } while (jh != last_jh && *nr_to_scan); > + > + return nr_freed; > +} > + > +/* > + * jbd2_journal_shrink_checkpoint_list > + * > + * Find 'nr_to_scan' written-back checkpoint buffers in the journal > + * and try to release them. Return the number of released checkpointed > + * buffers. > + * > + * Called with j_list_lock held. 
> + */ > +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, > + unsigned long *nr_to_scan) > +{ > + transaction_t *transaction, *last_transaction, *next_transaction; > + bool released; > + tid_t first_tid = 0, last_tid = 0, next_tid = 0; > + tid_t tid = 0; > + unsigned long nr_freed = 0; > + unsigned long nr_scanned = *nr_to_scan; > + > +again: > + spin_lock(&journal->j_list_lock); > + if (!journal->j_checkpoint_transactions) { > + spin_unlock(&journal->j_list_lock); > + goto out; > + } > + > + /* > + * Get next shrink transaction, resume previous scan or start > + * over again. If some others do checkpoint and drop transaction > + * from the checkpoint list, we ignore saved j_shrink_transaction > + * and start over unconditionally. > + */ > + if (journal->j_shrink_transaction) > + transaction = journal->j_shrink_transaction; > + else > + transaction = journal->j_checkpoint_transactions; > + > + if (!first_tid) > + first_tid = transaction->t_tid; > + last_transaction = journal->j_checkpoint_transactions->t_cpprev; > + next_transaction = transaction; > + last_tid = last_transaction->t_tid; > + do { > + transaction = next_transaction; > + next_transaction = transaction->t_cpnext; > + tid = transaction->t_tid; > + released = false; > + > + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list, > + nr_to_scan, &released); > + if (*nr_to_scan == 0) > + break; > + if (need_resched() || spin_needbreak(&journal->j_list_lock)) > + break; > + if (released) > + continue; > + > + nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list, > + nr_to_scan, &released); > + if (*nr_to_scan == 0) > + break; > + if (need_resched() || spin_needbreak(&journal->j_list_lock)) > + break; > + } while (transaction != last_transaction); > + > + if (transaction != last_transaction) { > + journal->j_shrink_transaction = next_transaction; > + next_tid = next_transaction->t_tid; > + } else { > + journal->j_shrink_transaction = NULL; > + next_tid 
= 0; > + } > + > + spin_unlock(&journal->j_list_lock); > + cond_resched(); > + > + if (*nr_to_scan && next_tid) > + goto again; > +out: > + nr_scanned -= *nr_to_scan; > + trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid, > + nr_freed, nr_scanned, next_tid); > + > + return nr_freed; > +} > + > /* > * journal_clean_checkpoint_list > * > @@ -580,6 +723,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) > > __buffer_unlink(jh); > jh->b_cp_transaction = NULL; > + percpu_counter_dec(&journal->j_jh_shrink_count); > jbd2_journal_put_journal_head(jh); > > /* Is this transaction empty? */ > @@ -642,6 +786,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, > jh->b_cpnext->b_cpprev = jh; > } > transaction->t_checkpoint_list = jh; > + percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count); > } > > /* > @@ -657,6 +802,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh, > void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction) > { > assert_spin_locked(&journal->j_list_lock); > + > + journal->j_shrink_transaction = NULL; > if (transaction->t_cpnext) { > transaction->t_cpnext->t_cpprev = transaction->t_cpprev; > transaction->t_cpprev->t_cpnext = transaction->t_cpnext; > diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c > index 90146755941f..3746bb4fc431 100644 > --- a/fs/jbd2/journal.c > +++ b/fs/jbd2/journal.c > @@ -1954,6 +1954,91 @@ int jbd2_journal_load(journal_t *journal) > return -EIO; > } > > +/** > + * jbd2_journal_shrink_scan() > + * > + * Scan the checkpointed buffer on the checkpoint list and release the > + * journal_head. 
> + */ > +static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, > + struct shrink_control *sc) > +{ > + journal_t *journal = container_of(shrink, journal_t, j_shrinker); > + unsigned long nr_to_scan = sc->nr_to_scan; > + unsigned long nr_shrunk; > + unsigned long count; > + > + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); > + trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count); > + > + nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan); > + > + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); > + trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count); > + > + return nr_shrunk; > +} > + > +/** > + * jbd2_journal_shrink_count() > + * > + * Count the number of checkpoint buffers on the checkpoint list. > + */ > +static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, > + struct shrink_control *sc) > +{ > + journal_t *journal = container_of(shrink, journal_t, j_shrinker); > + unsigned long count; > + > + count = percpu_counter_read_positive(&journal->j_jh_shrink_count); > + trace_jbd2_shrink_count(journal, sc->nr_to_scan, count); > + > + return count; > +} > + > +/** > + * jbd2_journal_register_shrinker() > + * @journal: Journal to act on. > + * > + * Init a percpu counter to record the checkpointed buffers on the checkpoint > + * list and register a shrinker to release their journal_head. 
> + */ > +int jbd2_journal_register_shrinker(journal_t *journal) > +{ > + int err; > + > + journal->j_shrink_transaction = NULL; > + > + err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL); > + if (err) > + return err; > + > + journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; > + journal->j_shrinker.count_objects = jbd2_journal_shrink_count; > + journal->j_shrinker.seeks = DEFAULT_SEEKS; > + journal->j_shrinker.batch = journal->j_max_transaction_buffers; > + > + err = register_shrinker(&journal->j_shrinker); > + if (err) { > + percpu_counter_destroy(&journal->j_jh_shrink_count); > + return err; > + } > + > + return 0; > +} > + > +/** > + * jbd2_journal_unregister_shrinker() > + * @journal: Journal to act on. > + * > + * Unregister the checkpointed buffer shrinker and destroy the percpu counter. > + */ > +void jbd2_journal_unregister_shrinker(journal_t *journal) > +{ > + percpu_counter_destroy(&journal->j_jh_shrink_count); > + unregister_shrinker(&journal->j_shrinker); > +} > + > /** > * jbd2_journal_destroy() - Release a journal_t structure. > * @journal: Journal to act on. > @@ -2026,6 +2111,8 @@ int jbd2_journal_destroy(journal_t *journal) > brelse(journal->j_sb_buffer); > } > > + jbd2_journal_unregister_shrinker(journal); > + > if (journal->j_proc_entry) > jbd2_stats_proc_exit(journal); > iput(journal->j_inode); > diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h > index f9b5e657b8f3..23578506215f 100644 > --- a/include/linux/jbd2.h > +++ b/include/linux/jbd2.h > @@ -909,6 +909,29 @@ struct journal_s > */ > struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; > > + /** > + * @j_shrinker: > + * > + * Journal head shrinker, reclaim buffer's journal head which > + * has been written back. > + */ > + struct shrinker j_shrinker; > + > + /** > + * @j_jh_shrink_count: > + * > + * Number of journal buffers on the checkpoint list. 
[j_list_lock] > + */ > + struct percpu_counter j_jh_shrink_count; > + > + /** > + * @j_shrink_transaction: > + * > + * Record next transaction will shrink on the checkpoint list. > + * [j_list_lock] > + */ > + transaction_t *j_shrink_transaction; > + > /** > * @j_head: > * > @@ -1418,6 +1441,7 @@ extern void jbd2_journal_commit_transaction(journal_t *); > > /* Checkpoint list management */ > void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy); > +unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan); > int __jbd2_journal_remove_checkpoint(struct journal_head *); > void jbd2_journal_destroy_checkpoint(journal_t *journal); > void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *); > @@ -1528,6 +1552,8 @@ extern int jbd2_journal_set_features > (journal_t *, unsigned long, unsigned long, unsigned long); > extern void jbd2_journal_clear_features > (journal_t *, unsigned long, unsigned long, unsigned long); > +extern int jbd2_journal_register_shrinker(journal_t *journal); > +extern void jbd2_journal_unregister_shrinker(journal_t *journal); > extern int jbd2_journal_load (journal_t *journal); > extern int jbd2_journal_destroy (journal_t *); > extern int jbd2_journal_recover (journal_t *journal); > diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h > index d16a32867f3a..a4dfe005983d 100644 > --- a/include/trace/events/jbd2.h > +++ b/include/trace/events/jbd2.h > @@ -394,6 +394,107 @@ TRACE_EVENT(jbd2_lock_buffer_stall, > __entry->stall_ms) > ); > > +DECLARE_EVENT_CLASS(jbd2_journal_shrink, > + > + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, > + unsigned long count), > + > + TP_ARGS(journal, nr_to_scan, count), > + > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(unsigned long, nr_to_scan) > + __field(unsigned long, count) > + ), > + > + TP_fast_assign( > + __entry->dev = journal->j_fs_dev->bd_dev; > + __entry->nr_to_scan = nr_to_scan; > + 
__entry->count = count; > + ), > + > + TP_printk("dev %d,%d nr_to_scan %lu count %lu", > + MAJOR(__entry->dev), MINOR(__entry->dev), > + __entry->nr_to_scan, __entry->count) > +); > + > +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_count, > + > + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), > + > + TP_ARGS(journal, nr_to_scan, count) > +); > + > +DEFINE_EVENT(jbd2_journal_shrink, jbd2_shrink_scan_enter, > + > + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, unsigned long count), > + > + TP_ARGS(journal, nr_to_scan, count) > +); > + > +TRACE_EVENT(jbd2_shrink_scan_exit, > + > + TP_PROTO(journal_t *journal, unsigned long nr_to_scan, > + unsigned long nr_shrunk, unsigned long count), > + > + TP_ARGS(journal, nr_to_scan, nr_shrunk, count), > + > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(unsigned long, nr_to_scan) > + __field(unsigned long, nr_shrunk) > + __field(unsigned long, count) > + ), > + > + TP_fast_assign( > + __entry->dev = journal->j_fs_dev->bd_dev; > + __entry->nr_to_scan = nr_to_scan; > + __entry->nr_shrunk = nr_shrunk; > + __entry->count = count; > + ), > + > + TP_printk("dev %d,%d nr_to_scan %lu nr_shrunk %lu count %lu", > + MAJOR(__entry->dev), MINOR(__entry->dev), > + __entry->nr_to_scan, __entry->nr_shrunk, > + __entry->count) > +); > + > +TRACE_EVENT(jbd2_shrink_checkpoint_list, > + > + TP_PROTO(journal_t *journal, tid_t first_tid, tid_t tid, tid_t last_tid, > + unsigned long nr_freed, unsigned long nr_scanned, > + tid_t next_tid), > + > + TP_ARGS(journal, first_tid, tid, last_tid, nr_freed, > + nr_scanned, next_tid), > + > + TP_STRUCT__entry( > + __field(dev_t, dev) > + __field(tid_t, first_tid) > + __field(tid_t, tid) > + __field(tid_t, last_tid) > + __field(unsigned long, nr_freed) > + __field(unsigned long, nr_scanned) > + __field(tid_t, next_tid) > + ), > + > + TP_fast_assign( > + __entry->dev = journal->j_fs_dev->bd_dev; > + __entry->first_tid = first_tid; > + __entry->tid = tid; > + 
__entry->last_tid = last_tid; > + __entry->nr_freed = nr_freed; > + __entry->nr_scanned = nr_scanned; > + __entry->next_tid = next_tid; > + ), > + > + TP_printk("dev %d,%d shrink transaction %u-%u(%u) freed %lu " > + "scanned %lu next transaction %u", > + MAJOR(__entry->dev), MINOR(__entry->dev), > + __entry->first_tid, __entry->tid, __entry->last_tid, > + __entry->nr_freed, __entry->nr_scanned, __entry->next_tid) > +); > + > #endif /* _TRACE_JBD2_H */ > > /* This part must be outside protection */ > -- > 2.31.1 > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR