On 2018/5/6 3:28 AM, Eric Wheeler wrote: > On Sun, 6 May 2018, Coly Li wrote: > >> On 2018/4/27 4:32 AM, Eric Wheeler wrote: >>>> Hi Coly, >>>> >>>> I'm sure you've been busy with the v4.17 merge but I thought I >>>> would check in: >>>> >>>> Have you had a chance to look at this? It is an opportunity to fix this >>>> 4k bug for the future since we can still reproduce the error. >>>> >>>>>>> bcache: bch_count_io_errors() dm-6: IO error on reading from cache, recovering <<< >>>> >>>> What do you think, is there data corruption exposure here since 4.1.49 >>>> still has the dirty-cache-recorvery bug? >>> >>> I just noticed that "bcache: only permit to recovery read error when cache >>> device is clean" is in v4.1.49, but would this recover gracefully in the >>> 4k error situation? >>> >>>> Also, would your failure-recovery patch series address this type of >>>> failure? >> >> Hi Eric, >> >> I just find a time slot to compose a patch checking 4K alignment of I/Os >> to backing device. After testing and first glance at the messages, I >> will post it out. >> > > Thank you Coly, I appreciate your help! Hi Eric, Please check the attached patch; it checks the bcache key offset, and if the offset is not 4KB aligned, a call trace will be printed out. I also ran it on my own hardware; here is some information I may share. It seems bcache just tries to cache a bio with any offset, with no 4K alignment required. When I use fio with directIO, non-4K-aligned bios can be sent into the bcache code, and they are just cached. I can see numerous call traces when I use directIO with a 512B/1KB/2KB block size. But if I use a 4KB block size in fio, or set block alignment to 4KB, no warning call traces are printed at all. Then when I set block alignment to 2K in fio, even if blocksize is 4KB, I can see the non-4k-aligned warning. Therefore I guess the most probable reason is that the upper layer code sends non-4k-aligned bios into the bcache code. 
I also tried to set the bcache block size to 4K with make-bcache -w; when the fio blocksize >= 4KB, there is no non-4k-aligned warning. But fio does not work if its blocksize < the bcache block size, so I am not sure whether setting the bcache block size to 4K works for your situation. Just for your information. Coly Li
From 4fca0f5fa2504c85d397590b9429e30bcbd253db Mon Sep 17 00:00:00 2001 From: Coly Li <colyli@xxxxxxx> Date: Sun, 6 May 2018 00:05:21 +0800 Subject: [PATCH] bcache: check 4K alignment for KEY_OFFSET() and KEY_START() If KEY_OFFSET() or KEY_START() of a key is not 4KB aligned, dump call trace. The information may provide some clue why some special I/Os to backing device are not 4KB aligned. Signed-off-by: Coly Li <colyli@xxxxxxx> --- drivers/md/bcache/alloc.c | 3 +++ drivers/md/bcache/bcache.h | 8 ++++++++ drivers/md/bcache/bset.c | 6 ++++-- drivers/md/bcache/btree.c | 9 ++++++--- drivers/md/bcache/extents.c | 14 ++++++++++++++ drivers/md/bcache/movinggc.c | 2 ++ drivers/md/bcache/request.c | 2 ++ drivers/md/bcache/writeback.c | 10 ++++++++-- 8 files changed, 47 insertions(+), 7 deletions(-) diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c index 8eeab72b93e2..b16f3b46ec62 100644 --- a/drivers/md/bcache/alloc.c +++ b/drivers/md/bcache/alloc.c @@ -616,6 +616,9 @@ bool bch_alloc_sectors(struct cache_set *c, struct bkey *k, unsigned sectors, sectors = min(sectors, b->sectors_free); + check_4k_alignment(KEY_OFFSET(k), "KEY_OFFSET(k)"); + check_4k_alignment(KEY_OFFSET(k) + sectors, "KEY_OFFSET(k) + sectors"); + SET_KEY_OFFSET(k, KEY_OFFSET(k) + sectors); SET_KEY_SIZE(k, sectors); SET_KEY_PTRS(k, KEY_PTRS(&b->key)); diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index 6b420a55c745..c9ffef3cddd0 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -847,6 +847,14 @@ static inline void wake_up_allocators(struct cache_set *c) wake_up_process(ca->alloc_thread); } +static inline void check_4k_alignment(size_t sectors, char *str) +{ + if (unlikely(sectors & 7)) { + pr_err("%s is not 4KB aligned: %lu", str, sectors); + dump_stack(); + } +} + /* Forward declarations */ void bch_count_io_errors(struct cache *, int, const char *); diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c index 
646fe85261c1..e6f9b9579d05 100644 --- a/drivers/md/bcache/bset.c +++ b/drivers/md/bcache/bset.c @@ -9,6 +9,7 @@ #include "util.h" #include "bset.h" +#include "bcache.h" #include <linux/console.h> #include <linux/random.h> @@ -219,9 +220,10 @@ bool __bch_cut_back(const struct bkey *where, struct bkey *k) BUG_ON(KEY_INODE(where) != KEY_INODE(k)); - if (bkey_cmp(where, &START_KEY(k)) > 0) + if (bkey_cmp(where, &START_KEY(k)) > 0) { + check_4k_alignment(KEY_START(k), "KEY_START(k)"); len = KEY_OFFSET(where) - KEY_START(k); - + } bkey_copy_key(k, where); BUG_ON(len > KEY_SIZE(k)); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 22b9e34ceb75..246fa3a5a71f 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1931,7 +1931,6 @@ static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, if (bkey_cmp(k, &b->key) <= 0) { if (!b->level) bkey_put(b->c, k); - ret |= btree_insert_key(b, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { @@ -2320,10 +2319,11 @@ static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, return ret; } - if (!b->level && (flags & MAP_END_KEY)) + if (!b->level && (flags & MAP_END_KEY)) { + check_4k_alignment(KEY_OFFSET(&b->key), "KEY_OFFSET(&b->key)"); ret = fn(op, b, &KEY(KEY_INODE(&b->key), KEY_OFFSET(&b->key), 0)); - + } return ret; } @@ -2420,6 +2420,9 @@ void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, bch_btree_map_keys(&refill.op, c, &buf->last_scanned, refill_keybuf_fn, MAP_END_KEY); + check_4k_alignment(KEY_OFFSET(&start), "KEY_OFFSET(&start)"); + check_4k_alignment(KEY_OFFSET(&buf->last_scanned), "KEY_OFFSET(&buf->last_scanned)"); + trace_bcache_keyscan(refill.nr_found, KEY_INODE(&start), KEY_OFFSET(&start), KEY_INODE(&buf->last_scanned), diff --git a/drivers/md/bcache/extents.c b/drivers/md/bcache/extents.c index 243de0bf15cd..3df03f401765 100644 --- a/drivers/md/bcache/extents.c +++ 
b/drivers/md/bcache/extents.c @@ -357,6 +357,11 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, * operations. */ + check_4k_alignment(KEY_START(k), "KEY_START(k)"); + check_4k_alignment(KEY_START(insert), "KEY_START(insert)"); + if (replace_key) + check_4k_alignment(KEY_START(replace_key), "KEY_START(replace_key)"); + if (replace_key && KEY_SIZE(k)) { /* * k might have been split since we inserted/found the @@ -366,6 +371,7 @@ static bool bch_extent_insert_fixup(struct btree_keys *b, uint64_t offset = KEY_START(k) - KEY_START(replace_key); + /* But it must be a subset of the replace key */ if (KEY_START(k) < KEY_START(replace_key) || KEY_OFFSET(k) > KEY_OFFSET(replace_key)) @@ -460,6 +466,13 @@ check_failed: if (!sectors_found) { return true; } else if (sectors_found < KEY_SIZE(insert)) { + check_4k_alignment(KEY_OFFSET(insert), "KEY_OFFSET(insert)"); + check_4k_alignment((KEY_SIZE(insert) - sectors_found), + "(KEY_SIZE(insert) - sectors_found)"); + check_4k_alignment(KEY_OFFSET(insert) - + (KEY_SIZE(insert) - sectors_found), + "KEY_OFFSET(insert) - (KEY_SIZE(insert) - sectors_found)"); + SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - (KEY_SIZE(insert) - sectors_found)); SET_KEY_SIZE(insert, sectors_found); @@ -606,6 +619,7 @@ static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey SET_KEY_CSUM(l, 0); } + check_4k_alignment(KEY_OFFSET(l) + KEY_SIZE(r), "KEY_OFFSET(l) + KEY_SIZE(r)"); SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r)); SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r)); diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index b929fc944e9c..05b8132b42b3 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -100,6 +100,8 @@ static void write_moving(struct closure *cl) if (!op->error) { moving_init(io); + check_4k_alignment(KEY_START(&io->w->key), + "KEY_START(&io->w->key)"); io->bio.bio.bi_iter.bi_sector = KEY_START(&io->w->key); op->write_prio = 1; op->bio = &io->bio.bio; diff 
--git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 25fa8445bb24..370668976c59 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -515,6 +515,8 @@ static int cache_lookup_fn(struct btree_op *op, struct btree *b, struct bkey *k) if (bkey_cmp(k, &KEY(s->iop.inode, bio->bi_iter.bi_sector, 0)) <= 0) return MAP_CONTINUE; + check_4k_alignment(KEY_START(k), "KEY_START(k)"); + if (KEY_INODE(k) != s->iop.inode || KEY_START(k) > bio->bi_iter.bi_sector) { unsigned bio_sectors = bio_sectors(bio); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b9346cd9cda1..ce93357e4c54 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -184,6 +184,7 @@ static void write_dirty(struct closure *cl) dirty_init(w); io->bio.bi_rw = WRITE; + check_4k_alignment(KEY_START(&w->key), "KEY_START(&w->key)"); io->bio.bi_iter.bi_sector = KEY_START(&w->key); io->bio.bi_bdev = io->dc->bdev; io->bio.bi_end_io = dirty_endio; @@ -236,11 +237,13 @@ static void read_dirty(struct cached_dev *dc) BUG_ON(ptr_stale(dc->disk.c, &w->key, 0)); + check_4k_alignment(KEY_START(&w->key), "KEY_START(&w->key)"); if (KEY_START(&w->key) != dc->last_read || jiffies_to_msecs(delay) > 50) while (!kthread_should_stop() && delay) delay = schedule_timeout_interruptible(delay); + check_4k_alignment(KEY_OFFSET(&w->key), "KEY_OFFSET(&w->key)"); dc->last_read = KEY_OFFSET(&w->key); io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec) @@ -336,6 +339,8 @@ static void refill_full_stripes(struct cached_dev *dc) unsigned start_stripe, stripe, next_stripe; bool wrapped = false; + check_4k_alignment(KEY_OFFSET(&buf->last_scanned), + "KEY_OFFSET(&buf->last_scanned)"); stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned)); if (stripe >= dc->disk.nr_stripes) @@ -481,10 +486,11 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, if (KEY_INODE(k) > op->inode) return MAP_DONE; - if (KEY_DIRTY(k)) 
+ if (KEY_DIRTY(k)) { + check_4k_alignment(KEY_START(k), "KEY_START(k)"); bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), KEY_START(k), KEY_SIZE(k)); - + } return MAP_CONTINUE; } -- 2.16.3