The patch increases the parallelism of mb_cache_entry utilization by replacing list_head with hlist_bl_node for the implementation of both the block and index hash tables. Each hlist_bl_node contains a built-in lock used to protect mb_cache's local block and index hash chains. The global data mb_cache_lru_list and mb_cache_list continue to be protected by the global mb_cache_spinlock. Signed-off-by: T. Makphaibulchoke <tmac@xxxxxx> --- Changed in v3: - Removed all hash lock macros. - Fixed a possible race condition updating the e_used and e_queued members of an mb_cache_entry between mb_cache_entry_get function, traversing an block hash chain, and mb_cache_entry_find function, traversing an index hash chain. Changed in v2: - As per Linus Torvalds' suggestion, instead of allocating spinlock arrays to protect the hash chains, use hlist_bl_head, which already contains builtin lock. fs/mbcache.c | 306 +++++++++++++++++++++++++++++++++++------------- include/linux/mbcache.h | 10 +- 2 files changed, 229 insertions(+), 87 deletions(-) diff --git a/fs/mbcache.c b/fs/mbcache.c index 8c32ef3..dd45fe9 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -26,6 +26,38 @@ * back on the lru list. */ +/* + * Lock descriptions and usage: + * + * Each hash chain of both the block and index hash tables now contains + * a built-in lock used to serialize accesses to the hash chain. + * + * Accesses to global data structures mb_cache_list and mb_cache_lru_list + * are serialized via the global spinlock mb_cache_spinlock. + * + * Lock ordering: + * + * Each block hash chain's lock has the highest order, followed by each + * index hash chain's lock, with mb_cache_spinlock the lowest. + * While holding a block hash chain lock a thread can acquire either + * an index hash chain lock or mb_cache_spinlock. + * + * Synchronization: + * + * Since both the e_used and e_queued members of each mb_cache_entry can + * be updated while traversing either a block hash chain or an index hash + * chain and the index hash chain lock has lower oder, each index hash + * chain's lock, in addition to being used to serialize accesses to the + * index hash chain, is also used to serialize accesses to both e_used + * and e_queued. + * + * To avoid having a dangling reference to an already freed + * mb_cache_entry, an mb_cache_entry is only freed when it is not on a + * block hash chain and also no longer being referenced. Both e_used + * and e_queued are 0's. When an mb_cache_entry is explicitly + * freed it is first removed from a block hash chain. + */ + #include <linux/kernel.h> #include <linux/module.h> @@ -35,9 +67,9 @@ #include <linux/slab.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/list_bl.h> #include <linux/mbcache.h> - #ifdef MB_CACHE_DEBUG # define mb_debug(f...) do { \ printk(KERN_DEBUG f); \ @@ -99,23 +131,34 @@ static struct shrinker mb_cache_shrinker = { }; static inline int -__mb_cache_entry_is_hashed(struct mb_cache_entry *ce) +__mb_cache_entry_is_block_hashed(struct mb_cache_entry *ce) { - return !list_empty(&ce->e_block_list); + return !hlist_bl_unhashed(&ce->e_block_list); } static void -__mb_cache_entry_unhash(struct mb_cache_entry *ce) +__mb_cache_entry_unhash_block(struct mb_cache_entry *ce) { - if (__mb_cache_entry_is_hashed(ce)) { - list_del_init(&ce->e_block_list); - list_del(&ce->e_index.o_list); - } + if (__mb_cache_entry_is_block_hashed(ce)) + hlist_bl_del_init(&ce->e_block_list); +} + +static inline int +__mb_cache_entry_is_index_hashed(struct mb_cache_entry *ce) +{ + return !hlist_bl_unhashed(&ce->e_index.o_list); } static void +__mb_cache_entry_unhash_index(struct mb_cache_entry *ce) +{ + if (__mb_cache_entry_is_index_hashed(ce)) + hlist_bl_del(&ce->e_index.o_list); +} + +static void __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) { struct mb_cache *cache = ce->e_cache; @@ -128,8 +171,8 @@ __mb_cache_entry_forget(struct mb_cache_entry *ce, gfp_t gfp_mask) static void __mb_cache_entry_release_unlock(struct mb_cache_entry *ce) - __releases(mb_cache_spinlock) { + hlist_bl_lock(ce->e_index_hash_p); /* Wake up all processes queuing for this cache entry. */ if (ce->e_queued) wake_up_all(&mb_cache_queue); @@ -137,15 +180,20 @@ __mb_cache_entry_release_unlock(struct mb_cache_entry *ce) ce->e_used -= MB_CACHE_WRITER; ce->e_used--; if (!(ce->e_used || ce->e_queued)) { - if (!__mb_cache_entry_is_hashed(ce)) + hlist_bl_unlock(ce->e_index_hash_p); + if (!__mb_cache_entry_is_block_hashed(ce)) goto forget; + spin_lock(&mb_cache_spinlock); mb_assert(list_empty(&ce->e_lru_list)); list_add_tail(&ce->e_lru_list, &mb_cache_lru_list); - } - spin_unlock(&mb_cache_spinlock); + spin_unlock(&mb_cache_spinlock); + } else + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); return; forget: - spin_unlock(&mb_cache_spinlock); + hlist_bl_unlock(ce->e_block_hash_p); + mb_assert(list_empty(&ce->e_lru_list)); __mb_cache_entry_forget(ce, GFP_KERNEL); } @@ -164,31 +212,46 @@ forget: static int mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc) { - LIST_HEAD(free_list); struct mb_cache *cache; - struct mb_cache_entry *entry, *tmp; int count = 0; int nr_to_scan = sc->nr_to_scan; gfp_t gfp_mask = sc->gfp_mask; mb_debug("trying to free %d entries", nr_to_scan); - spin_lock(&mb_cache_spinlock); - while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) { - struct mb_cache_entry *ce = - list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); + while (nr_to_scan > 0) { + struct mb_cache_entry *ce; + + spin_lock(&mb_cache_spinlock); + if (list_empty(&mb_cache_lru_list)) { + spin_unlock(&mb_cache_spinlock); + break; + } + ce = list_entry(mb_cache_lru_list.next, + struct mb_cache_entry, e_lru_list); + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + if (!(ce->e_used || ce->e_queued)) { + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_forget(ce, gfp_mask); + --nr_to_scan; + } else { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + } } + spin_lock(&mb_cache_spinlock); list_for_each_entry(cache, &mb_cache_list, c_cache_list) { mb_debug("cache %s (%d)", cache->c_name, atomic_read(&cache->c_entry_count)); count += atomic_read(&cache->c_entry_count); } spin_unlock(&mb_cache_spinlock); - list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) { - __mb_cache_entry_forget(entry, gfp_mask); - } return (count / 100) * sysctl_vfs_cache_pressure; } @@ -216,18 +279,18 @@ mb_cache_create(const char *name, int bucket_bits) cache->c_name = name; atomic_set(&cache->c_entry_count, 0); cache->c_bucket_bits = bucket_bits; - cache->c_block_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + cache->c_block_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_block_hash) goto fail; for (n=0; n<bucket_count; n++) - INIT_LIST_HEAD(&cache->c_block_hash[n]); - cache->c_index_hash = kmalloc(bucket_count * sizeof(struct list_head), - GFP_KERNEL); + INIT_HLIST_BL_HEAD(&cache->c_block_hash[n]); + cache->c_index_hash = kmalloc(bucket_count * + sizeof(struct hlist_bl_head), GFP_KERNEL); if (!cache->c_index_hash) goto fail; for (n=0; n<bucket_count; n++) - INIT_LIST_HEAD(&cache->c_index_hash[n]); + INIT_HLIST_BL_HEAD(&cache->c_index_hash[n]); cache->c_entry_cache = kmem_cache_create(name, sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); @@ -276,13 +339,24 @@ mb_cache_shrink(struct block_device *bdev) list_entry(l, struct mb_cache_entry, e_lru_list); if (ce->e_bdev == bdev) { list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); } } spin_unlock(&mb_cache_spinlock); list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_lru_list); + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + if (!(ce->e_used || ce->e_queued)) { + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_forget(ce, GFP_KERNEL); + } else { + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + } } } @@ -306,15 +380,21 @@ mb_cache_destroy(struct mb_cache *cache) list_entry(l, struct mb_cache_entry, e_lru_list); if (ce->e_cache == cache) { list_move_tail(&ce->e_lru_list, &free_list); - __mb_cache_entry_unhash(ce); } } list_del(&cache->c_cache_list); spin_unlock(&mb_cache_spinlock); list_for_each_safe(l, ltmp, &free_list) { - __mb_cache_entry_forget(list_entry(l, struct mb_cache_entry, - e_lru_list), GFP_KERNEL); + struct mb_cache_entry *ce = + list_entry(l, struct mb_cache_entry, e_lru_list); + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + __mb_cache_entry_forget(ce, GFP_KERNEL); } if (atomic_read(&cache->c_entry_count) > 0) { @@ -344,12 +424,27 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) struct mb_cache_entry *ce = NULL; if (atomic_read(&cache->c_entry_count) >= cache->c_max_entries) { + struct list_head *l; + struct list_head *ltmp; + spin_lock(&mb_cache_spinlock); - if (!list_empty(&mb_cache_lru_list)) { - ce = list_entry(mb_cache_lru_list.next, - struct mb_cache_entry, e_lru_list); - list_del_init(&ce->e_lru_list); - __mb_cache_entry_unhash(ce); + list_for_each_safe(l, ltmp, &mb_cache_lru_list) { + ce = list_entry(l, struct mb_cache_entry, e_lru_list); + if (ce->e_cache == cache) { + list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); + if (!(ce->e_used || ce->e_queued)) { + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); + hlist_bl_unlock(ce->e_block_hash_p); + break; + } + hlist_bl_unlock(ce->e_index_hash_p); + hlist_bl_unlock(ce->e_block_hash_p); + } } spin_unlock(&mb_cache_spinlock); } @@ -359,9 +454,12 @@ mb_cache_entry_alloc(struct mb_cache *cache, gfp_t gfp_flags) return NULL; atomic_inc(&cache->c_entry_count); INIT_LIST_HEAD(&ce->e_lru_list); - INIT_LIST_HEAD(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_block_list); + INIT_HLIST_BL_NODE(&ce->e_index.o_list); ce->e_cache = cache; ce->e_queued = 0; + ce->e_block_hash_p = &cache->c_block_hash[0]; + ce->e_index_hash_p = &cache->c_index_hash[0]; } ce->e_used = 1 + MB_CACHE_WRITER; return ce; @@ -388,28 +486,37 @@ mb_cache_entry_insert(struct mb_cache_entry *ce, struct block_device *bdev, { struct mb_cache *cache = ce->e_cache; unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; int error = -EBUSY; + struct hlist_bl_head *block_hash_p; + struct hlist_bl_head *index_hash_p; + struct mb_cache_entry *lce; bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each_prev(l, &cache->c_block_hash[bucket]) { - struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_block_list); - if (ce->e_bdev == bdev && ce->e_block == block) + block_hash_p = &cache->c_block_hash[bucket]; + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(lce, l, block_hash_p, e_block_list) { + if (lce->e_bdev == bdev && lce->e_block == block) goto out; } - __mb_cache_entry_unhash(ce); + mb_assert(!__mb_cache_entry_is_hashed(ce)); + __mb_cache_entry_unhash_index(ce); + __mb_cache_entry_unhash_block(ce); ce->e_bdev = bdev; ce->e_block = block; - list_add(&ce->e_block_list, &cache->c_block_hash[bucket]); + hlist_bl_add_head(&ce->e_block_list, block_hash_p); + ce->e_block_hash_p = block_hash_p; ce->e_index.o_key = key; bucket = hash_long(key, cache->c_bucket_bits); - list_add(&ce->e_index.o_list, &cache->c_index_hash[bucket]); + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + hlist_bl_add_head(&ce->e_index.o_list, index_hash_p); + ce->e_index_hash_p = index_hash_p; + hlist_bl_unlock(index_hash_p); error = 0; out: - spin_unlock(&mb_cache_spinlock); + hlist_bl_unlock(block_hash_p); return error; } @@ -424,7 +531,7 @@ out: void mb_cache_entry_release(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(ce->e_block_hash_p); __mb_cache_entry_release_unlock(ce); } @@ -438,9 +545,13 @@ mb_cache_entry_release(struct mb_cache_entry *ce) void mb_cache_entry_free(struct mb_cache_entry *ce) { - spin_lock(&mb_cache_spinlock); + mb_assert(ce); + hlist_bl_lock(ce->e_block_hash_p); + hlist_bl_lock(ce->e_index_hash_p); mb_assert(list_empty(&ce->e_lru_list)); - __mb_cache_entry_unhash(ce); + __mb_cache_entry_unhash_index(ce); + hlist_bl_unlock(ce->e_index_hash_p); + __mb_cache_entry_unhash_block(ce); __mb_cache_entry_release_unlock(ce); } @@ -458,33 +569,46 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, sector_t block) { unsigned int bucket; - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *block_hash_p; + int held_block_lock = 1; bucket = hash_long((unsigned long)bdev + (block & 0xffffffff), cache->c_bucket_bits); - spin_lock(&mb_cache_spinlock); - list_for_each(l, &cache->c_block_hash[bucket]) { - ce = list_entry(l, struct mb_cache_entry, e_block_list); + block_hash_p = &cache->c_block_hash[bucket]; + hlist_bl_lock(block_hash_p); + hlist_bl_for_each_entry(ce, l, block_hash_p, e_block_list) { + mb_assert(ce->e_block_hash_p == block_hash_p); if (ce->e_bdev == bdev && ce->e_block == block) { DEFINE_WAIT(wait); + spin_lock(&mb_cache_spinlock); if (!list_empty(&ce->e_lru_list)) list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); + hlist_bl_lock(ce->e_index_hash_p); while (ce->e_used > 0) { ce->e_queued++; + hlist_bl_unlock(ce->e_index_hash_p); prepare_to_wait(&mb_cache_queue, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); + if (held_block_lock) { + hlist_bl_unlock(block_hash_p); + held_block_lock = 0; + } schedule(); - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(ce->e_index_hash_p); ce->e_queued--; } - finish_wait(&mb_cache_queue, &wait); ce->e_used += 1 + MB_CACHE_WRITER; + hlist_bl_unlock(ce->e_index_hash_p); + finish_wait(&mb_cache_queue, &wait); - if (!__mb_cache_entry_is_hashed(ce)) { + if (!held_block_lock) + hlist_bl_lock(block_hash_p); + if (!__mb_cache_entry_is_block_hashed(ce)) { __mb_cache_entry_release_unlock(ce); return NULL; } @@ -494,24 +618,27 @@ mb_cache_entry_get(struct mb_cache *cache, struct block_device *bdev, ce = NULL; cleanup: - spin_unlock(&mb_cache_spinlock); + hlist_bl_unlock(block_hash_p); return ce; } #if !defined(MB_CACHE_INDEXES_COUNT) || (MB_CACHE_INDEXES_COUNT > 0) static struct mb_cache_entry * -__mb_cache_entry_find(struct list_head *l, struct list_head *head, +__mb_cache_entry_find(struct hlist_bl_node *l, struct hlist_bl_head *head, struct block_device *bdev, unsigned int key) { - while (l != head) { + while (l != NULL) { struct mb_cache_entry *ce = - list_entry(l, struct mb_cache_entry, e_index.o_list); + hlist_bl_entry(l, struct mb_cache_entry, + e_index.o_list); if (ce->e_bdev == bdev && ce->e_index.o_key == key) { DEFINE_WAIT(wait); + spin_lock(&mb_cache_spinlock); if (!list_empty(&ce->e_lru_list)) list_del_init(&ce->e_lru_list); + spin_unlock(&mb_cache_spinlock); /* Incrementing before holding the lock gives readers priority over writers. */ @@ -520,18 +647,23 @@ __mb_cache_entry_find(struct list_head *l, struct list_head *head, ce->e_queued++; prepare_to_wait(&mb_cache_queue, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&mb_cache_spinlock); + hlist_bl_unlock(head); schedule(); - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(head); + mb_assert(ce->e_index_hash_p == head); ce->e_queued--; } + hlist_bl_unlock(head); finish_wait(&mb_cache_queue, &wait); - if (!__mb_cache_entry_is_hashed(ce)) { + hlist_bl_lock(ce->e_block_hash_p); + if (!__mb_cache_entry_is_block_hashed(ce)) { __mb_cache_entry_release_unlock(ce); - spin_lock(&mb_cache_spinlock); + hlist_bl_lock(head); return ERR_PTR(-EAGAIN); - } + } else + hlist_bl_unlock(ce->e_block_hash_p); + mb_assert(ce->e_index_hash_p == head); return ce; } l = l->next; @@ -557,13 +689,17 @@ mb_cache_entry_find_first(struct mb_cache *cache, struct block_device *bdev, unsigned int key) { unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; - struct mb_cache_entry *ce; + struct hlist_bl_node *l; + struct mb_cache_entry *ce = NULL; + struct hlist_bl_head *index_hash_p; - spin_lock(&mb_cache_spinlock); - l = cache->c_index_hash[bucket].next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - spin_unlock(&mb_cache_spinlock); + index_hash_p = &cache->c_index_hash[bucket]; + hlist_bl_lock(index_hash_p); + if (!hlist_bl_empty(index_hash_p)) { + l = hlist_bl_first(index_hash_p); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + } + hlist_bl_unlock(index_hash_p); return ce; } @@ -592,13 +728,17 @@ mb_cache_entry_find_next(struct mb_cache_entry *prev, { struct mb_cache *cache = prev->e_cache; unsigned int bucket = hash_long(key, cache->c_bucket_bits); - struct list_head *l; + struct hlist_bl_node *l; struct mb_cache_entry *ce; + struct hlist_bl_head *index_hash_p; - spin_lock(&mb_cache_spinlock); + index_hash_p = &cache->c_index_hash[bucket]; + mb_assert(prev->e_index_hash_p == index_hash_p); + hlist_bl_lock(index_hash_p); + mb_assert(!hlist_bl_empty(index_hash_p)); l = prev->e_index.o_list.next; - ce = __mb_cache_entry_find(l, &cache->c_index_hash[bucket], bdev, key); - __mb_cache_entry_release_unlock(prev); + ce = __mb_cache_entry_find(l, index_hash_p, bdev, key); + hlist_bl_unlock(index_hash_p); return ce; } diff --git a/include/linux/mbcache.h b/include/linux/mbcache.h index 5525d37..89826c0 100644 --- a/include/linux/mbcache.h +++ b/include/linux/mbcache.h @@ -11,11 +11,13 @@ struct mb_cache_entry { unsigned short e_queued; struct block_device *e_bdev; sector_t e_block; - struct list_head e_block_list; + struct hlist_bl_node e_block_list; struct { - struct list_head o_list; + struct hlist_bl_node o_list; unsigned int o_key; } e_index; + struct hlist_bl_head *e_block_hash_p; + struct hlist_bl_head *e_index_hash_p; }; struct mb_cache { @@ -25,8 +27,8 @@ struct mb_cache { int c_max_entries; int c_bucket_bits; struct kmem_cache *c_entry_cache; - struct list_head *c_block_hash; - struct list_head *c_index_hash; + struct hlist_bl_head *c_block_hash; + struct hlist_bl_head *c_index_hash; }; /* Functions on caches */ -- 1.7.11.3 -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html