On 01/14/2018 03:42 PM, Coly Li wrote: > If a bcache device is configured to writeback mode, current code does not > handle write I/O errors on backing devices properly. > > In writeback mode, write request is written to cache device, and > latter being flushed to backing device. If I/O failed when writing from > cache device to the backing device, bcache code just ignores the error and > upper layer code is NOT noticed that the backing device is broken. > > This patch tries to handle backing device failure like how the cache device > failure is handled, > - Add a error counter 'io_errors' and error limit 'error_limit' in struct > cached_dev. Add another io_disable to struct cached_dev to disable I/Os > on the problematic backing device. > - When I/O error happens on backing device, increase io_errors counter. And > if io_errors reaches error_limit, set cache_dev->io_disable to true, and > stop the bcache device. > > The result is, if backing device is broken of disconnected, and I/O errors > reach its error limit, backing device will be disabled and the associated > bcache device will be removed from system. > > Signed-off-by: Coly Li <colyli@xxxxxxx> > Cc: Michael Lyle <mlyle@xxxxxxxx> > Cc: Hannes Reinecke <hare@xxxxxxxx> > Cc: Junhui Tang <tang.junhui@xxxxxxxxxx> > --- > drivers/md/bcache/bcache.h | 7 +++++++ > drivers/md/bcache/io.c | 14 ++++++++++++++ > drivers/md/bcache/request.c | 14 ++++++++++++-- > drivers/md/bcache/super.c | 22 ++++++++++++++++++++++ > drivers/md/bcache/sysfs.c | 15 ++++++++++++++- > 5 files changed, 69 insertions(+), 3 deletions(-) > > diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h > index c41736960045..5a811959392d 100644 > --- a/drivers/md/bcache/bcache.h > +++ b/drivers/md/bcache/bcache.h > @@ -360,6 +360,7 @@ struct cached_dev { > unsigned sequential_cutoff; > unsigned readahead; > > + unsigned io_disable:1; > unsigned verify:1; > unsigned bypass_torture_test:1; > > @@ -379,6 +380,10 @@ struct cached_dev { > unsigned writeback_rate_i_term_inverse; > unsigned writeback_rate_p_term_inverse; > unsigned writeback_rate_minimum; > + > +#define DEFAULT_CACHED_DEV_ERROR_LIMIT 64 > + atomic_t io_errors; > + unsigned error_limit; > }; > > enum alloc_reserve { > @@ -882,6 +887,7 @@ static inline void closure_bio_submit(struct cache_set *c, > > /* Forward declarations */ > > +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio); > void bch_count_io_errors(struct cache *, blk_status_t, int, const char *); > void bch_bbio_count_io_errors(struct cache_set *, struct bio *, > blk_status_t, const char *); > @@ -909,6 +915,7 @@ int bch_bucket_alloc_set(struct cache_set *, unsigned, > struct bkey *, int, bool); > bool bch_alloc_sectors(struct cache_set *, struct bkey *, unsigned, > unsigned, unsigned, bool); > +bool bch_cached_dev_error(struct cached_dev *dc); > > __printf(2, 3) > bool bch_cache_set_error(struct cache_set *, const char *, ...); > diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c > index 8013ecbcdbda..7fac97ae036e 100644 > --- a/drivers/md/bcache/io.c > +++ b/drivers/md/bcache/io.c > @@ -50,6 +50,20 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c, > } > > /* IO errors */ > +void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio) > +{ > + char buf[BDEVNAME_SIZE]; > + unsigned errors; > + > + WARN_ONCE(!dc, "NULL pointer of struct cached_dev"); > + > + errors = atomic_add_return(1, &dc->io_errors); > + if (errors < dc->error_limit) > + pr_err("%s: IO error on backing device, unrecoverable", > + bio_devname(bio, buf)); > + else > + bch_cached_dev_error(dc); > +} > > void bch_count_io_errors(struct cache *ca, > blk_status_t error, > diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c > index ad4cf71f7eab..386b388ce296 100644 > --- a/drivers/md/bcache/request.c > +++ b/drivers/md/bcache/request.c > @@ -637,6 +637,8 @@ static void backing_request_endio(struct bio *bio) > > if (bio->bi_status) { > struct search *s = container_of(cl, struct search, cl); > + struct cached_dev *dc = container_of(s->d, > + struct cached_dev, disk); > /* > * If a bio has REQ_PREFLUSH for writeback mode, it is > * speically assembled in cached_dev_write() for a non-zero > @@ -657,6 +659,7 @@ static void backing_request_endio(struct bio *bio) > } > s->recoverable = false; > /* should count I/O error for backing device here */ > + bch_count_backing_io_errors(dc, bio); > } > > bio_put(bio); > @@ -1067,8 +1070,14 @@ static void detatched_dev_end_io(struct bio *bio) > bio_data_dir(bio), > &ddip->d->disk->part0, ddip->start_time); > > - kfree(ddip); > + if (bio->bi_status) { > + struct cached_dev *dc = container_of(ddip->d, > + struct cached_dev, disk); > + /* should count I/O error for backing device here */ > + bch_count_backing_io_errors(dc, bio); > + } > > + kfree(ddip); > bio->bi_end_io(bio); > } > > @@ -1107,7 +1116,8 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q, > struct cached_dev *dc = container_of(d, struct cached_dev, disk); > int rw = bio_data_dir(bio); > > - if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { > + if (unlikely((d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags)) || > + dc->io_disable)) { > bio->bi_status = BLK_STS_IOERR; > bio_endio(bio); > return BLK_QC_T_NONE; > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c > index 08a0b541a4da..14fce3623770 100644 > --- a/drivers/md/bcache/super.c > +++ b/drivers/md/bcache/super.c > @@ -1188,6 +1188,10 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) > max(dc->disk.disk->queue->backing_dev_info->ra_pages, > q->backing_dev_info->ra_pages); > > + atomic_set(&dc->io_errors, 0); > + dc->io_disable = false; > + dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; > + > bch_cached_dev_request_init(dc); > bch_cached_dev_writeback_init(dc); > return 0; > @@ -1339,6 +1343,24 @@ int bch_flash_dev_create(struct cache_set *c, uint64_t size) > return flash_dev_run(c, u); > } > > +bool bch_cached_dev_error(struct cached_dev *dc) > +{ > + char name[BDEVNAME_SIZE]; > + > + if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags)) > + return false; > + > + dc->io_disable = true; > + /* make others know io_disable is true earlier */ > + smp_mb(); > + > + pr_err("bcache: stop %s: too many IO errors on backing device %s\n", > + dc->disk.name, bdevname(dc->bdev, name)); > + > + bcache_device_stop(&dc->disk); > + return true; > +} > + > /* Cache set */ > > __printf(2, 3) > diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c > index afb051bcfca1..7288927f2a47 100644 > --- a/drivers/md/bcache/sysfs.c > +++ b/drivers/md/bcache/sysfs.c > @@ -131,7 +131,9 @@ SHOW(__bch_cached_dev) > var_print(writeback_delay); > var_print(writeback_percent); > sysfs_hprint(writeback_rate, dc->writeback_rate.rate << 9); > - > + sysfs_hprint(io_errors, atomic_read(&dc->io_errors)); > + sysfs_printf(io_error_limit, "%i", dc->error_limit); > + sysfs_printf(io_disable, "%i", dc->io_disable); > var_print(writeback_rate_update_seconds); > var_print(writeback_rate_i_term_inverse); > var_print(writeback_rate_p_term_inverse); > @@ -223,6 +225,14 @@ STORE(__cached_dev) > d_strtoul(writeback_rate_i_term_inverse); > d_strtoul_nonzero(writeback_rate_p_term_inverse); > > + sysfs_strtoul_clamp(io_error_limit, dc->error_limit, 0, INT_MAX); > + > + if (attr == &sysfs_io_disable) { > + int v = strtoul_or_return(buf); > + > + dc->io_disable = v ? 1 : 0; > + } > + > d_strtoi_h(sequential_cutoff); > d_strtoi_h(readahead); > > @@ -330,6 +340,9 @@ static struct attribute *bch_cached_dev_files[] = { > &sysfs_writeback_rate_i_term_inverse, > &sysfs_writeback_rate_p_term_inverse, > &sysfs_writeback_rate_debug, > + &sysfs_errors, > + &sysfs_io_error_limit, > + &sysfs_io_disable, > &sysfs_dirty_data, > &sysfs_stripe_size, > &sysfs_partial_stripes_expensive, > Personally, I'm not a big fan of using smp_mb() and not proper locking. But in this case it should be okay. Reviewed-by: Hannes Reinecke <hare@xxxxxxxx> Cheers, Hannes -- Dr. Hannes Reinecke Teamlead Storage & Networking hare@xxxxxxx +49 911 74053 688 SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton HRB 21284 (AG Nürnberg) -- To unsubscribe from this list: send the line "unsubscribe linux-bcache" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html