Hi, Not so long ago we noticed that journal replay can take quite a lot (hours) in cases where many journaled blocks were freed during a short period. I benchmarked hash table used by revoke code, basically it’s lookup+insert like jbd2 does at replay: 1048576 records - 95 seconds 2097152 records - 580 seconds Then I benchmarked rhashtable: 1048576 records - 2 seconds 2097152 records - 3 seconds 4194304 records - 7 seconds So, here is a patch replacing existing fixed-size hash table with rhashtable, please have a look. Thanks, Alex >From 64b9161db7fee4eea665833765221d8a7e5903b6 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev <bzzz@xxxxxxxxxxxxx> Date: Tue, 31 Aug 2021 11:53:09 +0300 Subject: [PATCH] jbd2 to replace fixed-size revoke hashtable with rhashtable Signed-off-by: Alex Zhuravlev <bzzz@xxxxxxxxxxxxx> --- fs/jbd2/journal.c | 5 +- fs/jbd2/revoke.c | 255 +++++++++++++++---------------------------- include/linux/jbd2.h | 8 +- 3 files changed, 90 insertions(+), 178 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 35302bc192eb..6e3a2cc9dfd2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1370,7 +1370,7 @@ static journal_t *journal_init_common(struct block_device *bdev, journal->j_flags = JBD2_ABORT; /* Set up a default-sized revoke table for the new mount. */ - err = jbd2_journal_init_revoke(journal, JOURNAL_REVOKE_DEFAULT_HASH); + err = jbd2_journal_init_revoke(journal); if (err) goto err_cleanup; @@ -3136,8 +3136,6 @@ static int __init journal_init_caches(void) int ret; ret = jbd2_journal_init_revoke_record_cache(); - if (ret == 0) - ret = jbd2_journal_init_revoke_table_cache(); if (ret == 0) ret = jbd2_journal_init_journal_head_cache(); if (ret == 0) @@ -3152,7 +3150,6 @@ static int __init journal_init_caches(void) static void jbd2_journal_destroy_caches(void) { jbd2_journal_destroy_revoke_record_cache(); - jbd2_journal_destroy_revoke_table_cache(); jbd2_journal_destroy_journal_head_cache(); jbd2_journal_destroy_handle_cache(); jbd2_journal_destroy_inode_cache(); diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index fa608788b93d..4533bd49e879 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -90,10 +90,10 @@ #include <linux/bio.h> #include <linux/log2.h> #include <linux/hash.h> +#include <linux/rhashtable.h> #endif static struct kmem_cache *jbd2_revoke_record_cache; -static struct kmem_cache *jbd2_revoke_table_cache; /* Each revoke record represents one single revoked block. During journal replay, this involves recording the transaction ID of the @@ -101,23 +101,17 @@ static struct kmem_cache *jbd2_revoke_table_cache; struct jbd2_revoke_record_s { - struct list_head hash; + struct rhash_head linkage; tid_t sequence; /* Used for recovery only */ unsigned long long blocknr; }; - -/* The revoke table is just a simple hash table of revoke records. */ -struct jbd2_revoke_table_s -{ - /* It is conceivable that we might want a larger hash table - * for recovery. Must be a power of two. */ - int hash_size; - int hash_shift; - struct list_head *hash_table; +static const struct rhashtable_params revoke_rhashtable_params = { + .key_len = sizeof(unsigned long long), + .key_offset = offsetof(struct jbd2_revoke_record_s, blocknr), + .head_offset = offsetof(struct jbd2_revoke_record_s, linkage), }; - #ifdef __KERNEL__ static void write_one_revoke_record(transaction_t *, struct list_head *, @@ -126,18 +120,10 @@ static void write_one_revoke_record(transaction_t *, static void flush_descriptor(journal_t *, struct buffer_head *, int); #endif -/* Utility functions to maintain the revoke table */ - -static inline int hash(journal_t *journal, unsigned long long block) -{ - return hash_64(block, journal->j_revoke->hash_shift); -} - static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, tid_t seq) { - struct list_head *hash_list; - struct jbd2_revoke_record_s *record; + struct jbd2_revoke_record_s *record, *old; gfp_t gfp_mask = GFP_NOFS; if (journal_oom_retry) @@ -148,10 +134,12 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, record->sequence = seq; record->blocknr = blocknr; - hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; - spin_lock(&journal->j_revoke_lock); - list_add(&record->hash, hash_list); - spin_unlock(&journal->j_revoke_lock); + old = rhashtable_lookup_get_insert_fast(journal->j_revoke, + &record->linkage, revoke_rhashtable_params); + if (old) { + BUG_ON(record->sequence != seq); + kmem_cache_free(jbd2_revoke_record_cache, record); + } return 0; } @@ -160,22 +148,8 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, unsigned long long blocknr) { - struct list_head *hash_list; - struct jbd2_revoke_record_s *record; - - hash_list = &journal->j_revoke->hash_table[hash(journal, blocknr)]; - - spin_lock(&journal->j_revoke_lock); - record = (struct jbd2_revoke_record_s *) hash_list->next; - while (&(record->hash) != hash_list) { - if (record->blocknr == blocknr) { - spin_unlock(&journal->j_revoke_lock); - return record; - } - record = (struct jbd2_revoke_record_s *) record->hash.next; - } - spin_unlock(&journal->j_revoke_lock); - return NULL; + return rhashtable_lookup_fast(journal->j_revoke, &blocknr, + revoke_rhashtable_params); } void jbd2_journal_destroy_revoke_record_cache(void) @@ -184,12 +158,6 @@ void jbd2_journal_destroy_revoke_record_cache(void) jbd2_revoke_record_cache = NULL; } -void jbd2_journal_destroy_revoke_table_cache(void) -{ - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; -} - int __init jbd2_journal_init_revoke_record_cache(void) { J_ASSERT(!jbd2_revoke_record_cache); @@ -203,85 +171,27 @@ int __init jbd2_journal_init_revoke_record_cache(void) return 0; } -int __init jbd2_journal_init_revoke_table_cache(void) -{ - J_ASSERT(!jbd2_revoke_table_cache); - jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s, - SLAB_TEMPORARY); - if (!jbd2_revoke_table_cache) { - pr_emerg("JBD2: failed to create revoke_table cache\n"); - return -ENOMEM; - } - return 0; -} - -static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) -{ - int shift = 0; - int tmp = hash_size; - struct jbd2_revoke_table_s *table; - - table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); - if (!table) - goto out; - - while((tmp >>= 1UL) != 0UL) - shift++; - - table->hash_size = hash_size; - table->hash_shift = shift; - table->hash_table = - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); - if (!table->hash_table) { - kmem_cache_free(jbd2_revoke_table_cache, table); - table = NULL; - goto out; - } - - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&table->hash_table[tmp]); - -out: - return table; -} - -static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) -{ - int i; - struct list_head *hash_list; - - for (i = 0; i < table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT(list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, table); -} - /* Initialise the revoke table for a given journal to a given size. */ -int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +int jbd2_journal_init_revoke(journal_t *journal) { - J_ASSERT(journal->j_revoke_table[0] == NULL); - J_ASSERT(is_power_of_2(hash_size)); + int rc; - journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); - if (!journal->j_revoke_table[0]) + rc = rhashtable_init(&journal->j_revoke_table[0], &revoke_rhashtable_params); + if (rc) goto fail0; - journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); - if (!journal->j_revoke_table[1]) + rc = rhashtable_init(&journal->j_revoke_table[1], &revoke_rhashtable_params); + if (rc) goto fail1; - journal->j_revoke = journal->j_revoke_table[1]; + journal->j_revoke = &journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; fail1: - jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); - journal->j_revoke_table[0] = NULL; + rhashtable_destroy(&journal->j_revoke_table[0]); fail0: return -ENOMEM; } @@ -290,10 +200,8 @@ int jbd2_journal_init_revoke(journal_t *journal, int hash_size) void jbd2_journal_destroy_revoke(journal_t *journal) { journal->j_revoke = NULL; - if (journal->j_revoke_table[0]) - jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); - if (journal->j_revoke_table[1]) - jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); + rhashtable_destroy(&journal->j_revoke_table[0]); + rhashtable_destroy(&journal->j_revoke_table[1]); } @@ -446,9 +354,8 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) if (record) { jbd_debug(4, "cancelled existing revoke on " "blocknr %llu\n", (unsigned long long)bh->b_blocknr); - spin_lock(&journal->j_revoke_lock); - list_del(&record->hash); - spin_unlock(&journal->j_revoke_lock); + rhashtable_remove_fast(journal->j_revoke, &record->linkage, + revoke_rhashtable_params); kmem_cache_free(jbd2_revoke_record_cache, record); did_revoke = 1; } @@ -483,27 +390,29 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) */ void jbd2_clear_buffer_revoked_flags(journal_t *journal) { - struct jbd2_revoke_table_s *revoke = journal->j_revoke; - int i = 0; - - for (i = 0; i < revoke->hash_size; i++) { - struct list_head *hash_list; - struct list_head *list_entry; - hash_list = &revoke->hash_table[i]; - - list_for_each(list_entry, hash_list) { - struct jbd2_revoke_record_s *record; - struct buffer_head *bh; - record = (struct jbd2_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); - if (bh) { - clear_buffer_revoked(bh); - __brelse(bh); - } + struct rhashtable *revoke = journal->j_revoke; + struct jbd2_revoke_record_s *record; + struct rhashtable_iter iter; + + rhashtable_walk_enter(revoke, &iter); + rhashtable_walk_start(&iter); + while ((record = rhashtable_walk_next(&iter)) != NULL) { + struct buffer_head *bh; + + if (IS_ERR(record)) + continue; + rhashtable_walk_stop(&iter); + bh = __find_get_block(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); + if (bh) { + clear_buffer_revoked(bh); + __brelse(bh); } - } + rhashtable_walk_start(&iter); + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } /* journal_switch_revoke table select j_revoke for next transaction @@ -512,15 +421,12 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) */ void jbd2_journal_switch_revoke_table(journal_t *journal) { - int i; - - if (journal->j_revoke == journal->j_revoke_table[0]) - journal->j_revoke = journal->j_revoke_table[1]; + if (journal->j_revoke == &journal->j_revoke_table[0]) + journal->j_revoke = &journal->j_revoke_table[1]; else - journal->j_revoke = journal->j_revoke_table[0]; + journal->j_revoke = &journal->j_revoke_table[0]; - for (i = 0; i < journal->j_revoke->hash_size; i++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[i]); + /* XXX: check rhashtable is empty? reinitialize it? */ } /* @@ -533,31 +439,36 @@ void jbd2_journal_write_revoke_records(transaction_t *transaction, journal_t *journal = transaction->t_journal; struct buffer_head *descriptor; struct jbd2_revoke_record_s *record; - struct jbd2_revoke_table_s *revoke; - struct list_head *hash_list; - int i, offset, count; + struct rhashtable_iter iter; + struct rhashtable *revoke; + int offset, count; descriptor = NULL; offset = 0; count = 0; /* select revoke table for committing transaction */ - revoke = journal->j_revoke == journal->j_revoke_table[0] ? - journal->j_revoke_table[1] : journal->j_revoke_table[0]; - - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - - while (!list_empty(hash_list)) { - record = (struct jbd2_revoke_record_s *) - hash_list->next; + revoke = journal->j_revoke == &journal->j_revoke_table[0] ? + &journal->j_revoke_table[1] : &journal->j_revoke_table[0]; + + rhashtable_walk_enter(revoke, &iter); + rhashtable_walk_start(&iter); + while ((record = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(record)) + continue; + if (rhashtable_remove_fast(revoke, + &record->linkage, + revoke_rhashtable_params) == 0) { + rhashtable_walk_stop(&iter); write_one_revoke_record(transaction, log_bufs, &descriptor, &offset, record); + rhashtable_walk_start(&iter); count++; - list_del(&record->hash); kmem_cache_free(jbd2_revoke_record_cache, record); } } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); if (descriptor) flush_descriptor(journal, descriptor, offset); jbd_debug(1, "Wrote %d revoke records\n", count); @@ -725,19 +636,23 @@ int jbd2_journal_test_revoke(journal_t *journal, void jbd2_journal_clear_revoke(journal_t *journal) { - int i; - struct list_head *hash_list; + struct rhashtable_iter iter; struct jbd2_revoke_record_s *record; - struct jbd2_revoke_table_s *revoke; + struct rhashtable *revoke; revoke = journal->j_revoke; - for (i = 0; i < revoke->hash_size; i++) { - hash_list = &revoke->hash_table[i]; - while (!list_empty(hash_list)) { - record = (struct jbd2_revoke_record_s*) hash_list->next; - list_del(&record->hash); + rhashtable_walk_enter(revoke, &iter); + rhashtable_walk_start(&iter); + while ((record = rhashtable_walk_next(&iter)) != NULL) { + if (IS_ERR(record)) + continue; + if (rhashtable_remove_fast(revoke, + &record->linkage, + revoke_rhashtable_params) == 0) { kmem_cache_free(jbd2_revoke_record_cache, record); - } - } + } + } + rhashtable_walk_stop(&iter); + rhashtable_walk_exit(&iter); } diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd933c45281a..dcde4329cdd1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -29,6 +29,7 @@ #include <linux/bit_spinlock.h> #include <linux/blkdev.h> #include <crypto/hash.h> +#include <linux/rhashtable.h> #endif #define journal_oom_retry 1 @@ -1135,12 +1136,12 @@ struct journal_s * The revoke table - maintains the list of revoked blocks in the * current transaction. */ - struct jbd2_revoke_table_s *j_revoke; + struct rhashtable *j_revoke; /** * @j_revoke_table: Alternate revoke tables for j_revoke. */ - struct jbd2_revoke_table_s *j_revoke_table[2]; + struct rhashtable j_revoke_table[2]; /** * @j_wbuf: Array of bhs for jbd2_journal_commit_transaction. @@ -1625,8 +1626,7 @@ static inline void jbd2_free_inode(struct jbd2_inode *jinode) } /* Primary revoke support */ -#define JOURNAL_REVOKE_DEFAULT_HASH 256 -extern int jbd2_journal_init_revoke(journal_t *, int); +extern int jbd2_journal_init_revoke(journal_t *); extern void jbd2_journal_destroy_revoke_record_cache(void); extern void jbd2_journal_destroy_revoke_table_cache(void); extern int __init jbd2_journal_init_revoke_record_cache(void); -- 2.26.3