The problem: If an inflight backing WRITE operation for a block is performed that meets the criteria for bypassing the cache and that takes a long time to complete, a READ operation for the same block may be fully processed in the interim that populates the cache with the device content from before the inflight WRITE. When the inflight WRITE finally completes, since it was marked for bypass, the cache is not subsequently updated, and the stale data populated by the READ request remains in cache. While there is code in bcache for invalidating the cache when a bypassed WRITE is performed, this is done prior to issuing the backing I/O so it does not help. The proposed fix: Add two new lists to the cached_dev structure to track inflight "bypass" write requests and inflight read requests that have have missed cache. These are called "inflight_bypass_write_list" and "inflight_read_list", respectively, and are protected by a spinlock called the "inflight_lock" When a WRITE is bypassing the cache, check whether there is an overlapping inflight read. If so, set bypass = false to essentially convert the bypass write into a writethrough write. Otherwise, if there is no overlapping inflight read, then add the "search" structure to the inflight bypass write list. When a READ misses cache, check whether there is an overlapping inflight write. If so, set a new flag in the search structure called "do_not_cache" which causes cache population to be skipped after the backing I/O completes. Otherwise, if there is no overlapping inflight write, then add the "search" structure to the inflight read list. The rest of the changes are to add a new stat called "bypass_cache_insert_races" to track how many times the race was encountered. Example: cat /sys/fs/bcache/0c9b7a62-b431-4328-9dcb-a81e322238af/bdev0/stats_total/cache_bypass_races 16577 Assuming this is the correct approach, areas to look at: 1) Searching linked lists doesn't scale. Can something like an interval tree be used here instead? 2) Can this be restructured so that the inflight_lock doesn't have to be accessed with interrupts disabled? Note that search_free() can be called in interrupt context. 3) Can do_not_cache just be another (1-bit) flag in the search structure instead of occupying its own "int" ? --- drivers/md/bcache/bcache.h | 4 ++ drivers/md/bcache/request.c | 79 ++++++++++++++++++++++++++++++++++++- drivers/md/bcache/stats.c | 14 +++++++ drivers/md/bcache/stats.h | 4 ++ drivers/md/bcache/super.c | 4 ++ 5 files changed, 104 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 848dd4db1659..15e9f9f9a9bc 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -397,6 +397,10 @@ struct cached_dev { unsigned int error_limit; unsigned int offline_seconds; + struct list_head inflight_bypass_write_list; + struct list_head inflight_read_list; + spinlock_t inflight_lock; + char backing_dev_name[BDEVNAME_SIZE]; }; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 29c231758293..734f25358f78 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -480,8 +480,59 @@ struct search { struct btree_op op; struct data_insert_op iop; + struct list_head inflight_node; + uint64_t req_start; + uint64_t req_end; + int do_not_cache; }; +static bool reqs_overlap(struct search *s1, struct search *s2) +{ + if ((s1->req_start <= s2->req_start && s1->req_end > s2->req_start) || + (s1->req_start > s2->req_end && s1->req_start < s2->req_end)) { + + return(true); + } + + return(false); +} + +static bool check_inflight_overlapping(struct search *s) +{ + struct cached_dev *dc = container_of(s->d, struct cached_dev, disk); + struct list_head *lh, *search_list, *insert_list; + struct search *entry; + unsigned long flags; + bool found = false; + + if (s->write) { + search_list = &dc->inflight_read_list; + insert_list = &dc->inflight_bypass_write_list; + } else { + search_list = &dc->inflight_bypass_write_list; + insert_list = &dc->inflight_read_list; + } + + spin_lock_irqsave(&dc->inflight_lock, flags); + list_for_each(lh, search_list) { + entry = list_entry(lh, struct search, inflight_node); + if (reqs_overlap(s, entry)) { + found = true; + break; + } + } + + if (found == false) { + list_add(insert_list, &s->inflight_node); + } + spin_unlock_irqrestore(&dc->inflight_lock, flags); + + if (found) { + bch_mark_cache_bypass_race(s->d->c, s->d); + } + return(found); +} + static void bch_cache_read_endio(struct bio *bio) { struct bbio *b = container_of(bio, struct bbio, bio); @@ -702,6 +753,15 @@ static void do_bio_hook(struct search *s, static void search_free(struct closure *cl) { struct search *s = container_of(cl, struct search, cl); + struct cached_dev *dc = container_of(s->d, + struct cached_dev, disk); + unsigned long flags; + + if (!list_empty(&s->inflight_node)) { + spin_lock_irqsave(&dc->inflight_lock, flags); + list_del(&s->inflight_node); + spin_unlock_irqrestore(&dc->inflight_lock, flags); + } atomic_dec(&s->iop.c->search_inflight); @@ -735,6 +795,10 @@ static inline struct search *search_alloc(struct bio *bio, /* Count on the bcache device */ s->orig_bdev = orig_bdev; s->start_time = start_time; + INIT_LIST_HEAD(&s->inflight_node); + s->req_start = bio->bi_iter.bi_sector; + s->req_end = s->req_start + bio_sectors(bio); + s->do_not_cache = 0; s->iop.c = d->c; s->iop.bio = NULL; s->iop.inode = d->id; @@ -850,7 +914,7 @@ static void cached_dev_read_done(struct closure *cl) closure_get(&dc->disk.cl); bio_complete(s); - if (s->iop.bio && + if (s->iop.bio && s->do_not_cache == 0 && !test_bit(CACHE_SET_STOPPING, &s->iop.c->flags)) { BUG_ON(!s->iop.replace); closure_call(&s->iop.cl, bch_data_insert, NULL, cl); @@ -886,6 +950,12 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, s->cache_missed = 1; + if (s->do_not_cache == 0) { + if (check_inflight_overlapping(s)) { + s->do_not_cache = 1; + } + } + if (s->cache_miss || s->iop.bypass) { miss = bio_next_split(bio, sectors, GFP_NOIO, &s->d->bio_split); ret = miss == bio ? MAP_DONE : MAP_CONTINUE; @@ -981,6 +1051,13 @@ static void cached_dev_write(struct cached_dev *dc, struct search *s) bch_keybuf_check_overlapping(&s->iop.c->moving_gc_keys, &start, &end); + if (s->iop.bypass == true) { + if (check_inflight_overlapping(s)) { + /* convert bypass write into writethrough write */ + s->iop.bypass = false; + } + } + down_read_non_owner(&dc->writeback_lock); if (bch_keybuf_check_overlapping(&dc->writeback_keys, &start, &end)) { /* diff --git a/drivers/md/bcache/stats.c b/drivers/md/bcache/stats.c index 503aafe188dc..31dbf69c80d6 100644 --- a/drivers/md/bcache/stats.c +++ b/drivers/md/bcache/stats.c @@ -49,6 +49,7 @@ read_attribute(cache_hit_ratio); read_attribute(cache_readaheads); read_attribute(cache_miss_collisions); read_attribute(bypassed); +read_attribute(cache_bypass_races); SHOW(bch_stats) { @@ -66,6 +67,7 @@ SHOW(bch_stats) var_print(cache_readaheads); var_print(cache_miss_collisions); + var_print(cache_bypass_races); sysfs_hprint(bypassed, var(sectors_bypassed) << 9); #undef var return 0; @@ -89,6 +91,7 @@ static struct attribute *bch_stats_files[] = { &sysfs_cache_readaheads, &sysfs_cache_miss_collisions, &sysfs_bypassed, + &sysfs_cache_bypass_races, NULL }; static KTYPE(bch_stats); @@ -116,6 +119,7 @@ void bch_cache_accounting_clear(struct cache_accounting *acc) acc->total.cache_readaheads = 0; acc->total.cache_miss_collisions = 0; acc->total.sectors_bypassed = 0; + acc->total.cache_bypass_races = 0; } void bch_cache_accounting_destroy(struct cache_accounting *acc) @@ -148,6 +152,7 @@ static void scale_stats(struct cache_stats *stats, unsigned long rescale_at) scale_stat(&stats->cache_readaheads); scale_stat(&stats->cache_miss_collisions); scale_stat(&stats->sectors_bypassed); + scale_stat(&stats->cache_bypass_races); } } @@ -171,6 +176,7 @@ static void scale_accounting(struct timer_list *t) move_stat(cache_readaheads); move_stat(cache_miss_collisions); move_stat(sectors_bypassed); + move_stat(cache_bypass_races); scale_stats(&acc->total, 0); scale_stats(&acc->day, DAY_RESCALE); @@ -232,6 +238,14 @@ void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, atomic_add(sectors, &c->accounting.collector.sectors_bypassed); } +void bch_mark_cache_bypass_race(struct cache_set *c, struct bcache_device *d) +{ + struct cached_dev *dc = container_of(d, struct cached_dev, disk); + + atomic_inc(&dc->accounting.collector.cache_bypass_races); + atomic_inc(&c->accounting.collector.cache_bypass_races); +} + void bch_cache_accounting_init(struct cache_accounting *acc, struct closure *parent) { diff --git a/drivers/md/bcache/stats.h b/drivers/md/bcache/stats.h index abfaabf7e7fc..4cd2dcea068b 100644 --- a/drivers/md/bcache/stats.h +++ b/drivers/md/bcache/stats.h @@ -10,6 +10,7 @@ struct cache_stat_collector { atomic_t cache_readaheads; atomic_t cache_miss_collisions; atomic_t sectors_bypassed; + atomic_t cache_bypass_races; }; struct cache_stats { @@ -22,6 +23,7 @@ struct cache_stats { unsigned long cache_readaheads; unsigned long cache_miss_collisions; unsigned long sectors_bypassed; + unsigned long cache_bypass_races; unsigned int rescale; }; @@ -61,5 +63,7 @@ void bch_mark_cache_miss_collision(struct cache_set *c, void bch_mark_sectors_bypassed(struct cache_set *c, struct cached_dev *dc, int sectors); +void bch_mark_cache_bypass_race(struct cache_set *c, + struct bcache_device *d); #endif /* _BCACHE_STATS_H_ */ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 03e1fe4de53d..eb670aa26796 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1495,6 +1495,10 @@ static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk, goto err; } + spin_lock_init(&dc->inflight_lock); + INIT_LIST_HEAD(&dc->inflight_read_list); + INIT_LIST_HEAD(&dc->inflight_bypass_write_list); + return 0; err: pr_notice("error %s: %s\n", dc->backing_dev_name, err); -- 2.20.1