On Mon, Nov 12, 2018 at 10:20:28AM +0100, Florian-Ewald Müller wrote:
> Hello Neil,
>
> Thank you very much for your reply!
>
> Yes indeed, any incoming IO passing through md_handle_request() would
> (temporarily) prevent md_suspend() from making progress.
> We must admit that we observed the stale pointer in 'bitmap' in the context
> of our raid1 IO serialization patch (see attached file).
> Nevertheless we think that such a small, harmless change could make the
> code generally more robust.

Thanks Neil, I removed this patch. Help for an out-of-tree patch doesn't
justify this patch.

Thanks,
Shaohua
>
> Best regards,
>
> On Mon, Nov 12, 2018 at 4:42 AM, NeilBrown <neilb@xxxxxxxx> wrote:
>
> > On Tue, Nov 06 2018, Jack Wang wrote:
> >
> > > From: Florian-Ewald Mueller <florian-ewald.mueller@xxxxxxxxxxxxxxx>
> > >
> > > - the bitmap is only used in the first_clone clause of raid1_write_request.
> > > - possibly we have stale data in the bitmap pointer.
> > >
> > > Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@cloud.ionos.com>
> > > Signed-off-by: Jack Wang <jinpu.wang@xxxxxxxxxxxxxxx>
> > > ---
> > >  drivers/md/raid1.c | 11 ++++++++---
> > >  1 file changed, 8 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> > > index 1d54109071cc..727cd5759c83 100644
> > > --- a/drivers/md/raid1.c
> > > +++ b/drivers/md/raid1.c
> > > @@ -1191,7 +1191,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
> > >          struct r1conf *conf = mddev->private;
> > >          struct raid1_info *mirror;
> > >          struct bio *read_bio;
> > > -        struct bitmap *bitmap = mddev->bitmap;
> > > +        struct bitmap *bitmap;
> > >          const int op = bio_op(bio);
> > >          const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
> > >          int max_sectors;
> > > @@ -1254,7 +1254,9 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
> > >                              mdname(mddev),
> > >                              (unsigned long long)r1_bio->sector,
> > >                              bdevname(mirror->rdev->bdev, b));
> > > -
> > > +        /* get mddev->bitmap behind wait_barrier()
> > > +         * as it could have been removed before */
> > > +        bitmap = mddev->bitmap;
> >
> > This is unnecessary.  mddev_suspend() is called before mddev->bitmap is
> > changed, and mddev_suspend() cannot be called at this point as
> > md_handle_request() has incremented ->active_io for us.
> >
> >
> > >          if (test_bit(WriteMostly, &mirror->rdev->flags) &&
> > >              bitmap) {
> > >                  /*
> > > @@ -1305,7 +1307,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> > >          struct r1conf *conf = mddev->private;
> > >          struct r1bio *r1_bio;
> > >          int i, disks;
> > > -        struct bitmap *bitmap = mddev->bitmap;
> > >          unsigned long flags;
> > >          struct md_rdev *blocked_rdev;
> > >          struct blk_plug_cb *cb;
> > > @@ -1459,6 +1460,10 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
> > >
> > >
> > >          if (first_clone) {
> > > +                /* get mddev->bitmap behind wait_barrier()
> > > +                 * as it could have been removed before */
> > > +                struct bitmap *bitmap = mddev->bitmap;
> > > +
> >
> > Again, not necessary.
> > The code change isn't harmful, but the comment is, as it is wrong.
> >
> > NeilBrown
> >
> >
> > >                  /* do behind I/O ?
> > >                   * Not if there are too many, or cannot
> > >                   * allocate memory, or a reader on WriteMostly
> > > --
> > > 2.7.4
> >
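
For readers following the reasoning above: the ->active_io gating Neil refers to can be
modelled roughly as in the sketch below. This is a deliberately simplified user-space
sketch, not the md code; only md_handle_request(), mddev_suspend() and ->active_io are
names from the thread, everything else is invented for illustration.

/*
 * Simplified user-space model of the ->active_io gating described above.
 * NOT the kernel implementation; names other than active_io/suspended are
 * invented.  A resume step (not shown) would clear 'suspended' and wake
 * the waiters again.
 */
#include <pthread.h>
#include <stdbool.h>

struct fake_mddev {
        pthread_mutex_t lock;
        pthread_cond_t  change;
        int             active_io;  /* requests currently inside the personality */
        bool            suspended;
        void           *bitmap;     /* may only be replaced while fully drained  */
};

static struct fake_mddev md = {
        .lock   = PTHREAD_MUTEX_INITIALIZER,
        .change = PTHREAD_COND_INITIALIZER,
};

/* Plays the role of md_handle_request(): account the IO, then call down. */
static void handle_request(struct fake_mddev *m,
                           void (*make_request)(struct fake_mddev *))
{
        pthread_mutex_lock(&m->lock);
        while (m->suspended)                /* new IO waits while suspended */
                pthread_cond_wait(&m->change, &m->lock);
        m->active_io++;
        pthread_mutex_unlock(&m->lock);

        make_request(m);                    /* m->bitmap is stable in here  */

        pthread_mutex_lock(&m->lock);
        if (--m->active_io == 0)
                pthread_cond_broadcast(&m->change);
        pthread_mutex_unlock(&m->lock);
}

/* Plays the role of mddev_suspend(): returns only once no IO is in flight. */
static void suspend(struct fake_mddev *m)
{
        pthread_mutex_lock(&m->lock);
        m->suspended = true;
        while (m->active_io > 0)
                pthread_cond_wait(&m->change, &m->lock);
        pthread_mutex_unlock(&m->lock);
        /* only now is it safe to swap or free m->bitmap */
}

static void fake_raid1_request(struct fake_mddev *m)
{
        (void)m->bitmap;    /* cannot observe a half-removed bitmap */
}

int main(void)
{
        handle_request(&md, fake_raid1_request);
        suspend(&md);
        return 0;
}

In the kernel the same effect is achieved with, roughly, an atomic counter plus a
wait; the point is simply that mddev->bitmap cannot change while a request still holds
its active_io reference, which is why re-reading the pointer after wait_barrier() buys
nothing.
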
> diff -Naur linux-4.14.79/drivers/md/Kconfig linux-4.14.79-ioser/drivers/md/Kconfig
> --- linux-4.14.79/drivers/md/Kconfig    2018-11-04 14:52:51.000000000 +0100
> +++ linux-4.14.79-ioser/drivers/md/Kconfig      2018-11-06 17:15:00.809478897 +0100
> @@ -100,6 +100,16 @@
>
>           If unsure, say Y.
>
> +config MD_RAID1_IOSER
> +        tristate "RAID-1 I/O serialization per chunks"
> +        depends on MD_RAID1
> +        default n
> +        ---help---
> +          Serialize I/O per (configurable) chunks in order to avoid RAID-1 mirrors
> +          drifting apart with unsynchronized overlapping writes to the same chunk.
> +
> +          If unsure, say N.
> +
>  config MD_RAID10
>          tristate "RAID-10 (mirrored striping) mode"
>          depends on BLK_DEV_MD
> diff -Naur linux-4.14.79/drivers/md/raid1.c linux-4.14.79-ioser/drivers/md/raid1.c
> --- linux-4.14.79/drivers/md/raid1.c    2018-11-04 14:52:51.000000000 +0100
> +++ linux-4.14.79-ioser/drivers/md/raid1.c      2018-11-06 17:13:49.740257771 +0100
> @@ -84,6 +84,10 @@
>
>  #include "raid1-10.c"
>
> +#ifdef CONFIG_MD_RAID1_IOSER
> +#include "raid1-ioser.c"
> +#endif
> +
>  /*
>   * for resync bio, r1bio pointer can be retrieved from the per-bio
>   * 'struct resync_pages'.
> @@ -1545,7 +1549,11 @@
>          else {
>                  if (!md_write_start(mddev,bio))
>                          return false;
> +#ifdef CONFIG_MD_RAID1_IOSER
> +                raid1_ioser_write_request(mddev, bio, sectors);
> +#else
>                  raid1_write_request(mddev, bio, sectors);
> +#endif
>          }
>          return true;
>  }
> @@ -3016,10 +3024,19 @@
>          if (!conf->thread)
>                  goto abort;
>
> +#ifdef CONFIG_MD_RAID1_IOSER
> +        err = raid1_ioser_init(&conf->ioser);
> +        if (unlikely(err < 0))
> +                goto abort;
> +#endif
> +
>          return conf;
>
>   abort:
>          if (conf) {
> +#ifdef CONFIG_MD_RAID1_IOSER
> +                raid1_ioser_exit(&conf->ioser);
> +#endif
>                  mempool_destroy(conf->r1bio_pool);
>                  kfree(conf->mirrors);
>                  safe_put_page(conf->tmppage);
> @@ -3117,6 +3134,9 @@
>                  else
>                          queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
>                                                    mddev->queue);
> +#ifdef CONFIG_MD_RAID1_IOSER
> +                raid1_ioser_set_queue_limits(&conf->ioser);
> +#endif
>          }
>
>          ret = md_integrity_register(mddev);
> @@ -3131,6 +3151,9 @@
>  {
>          struct r1conf *conf = priv;
>
> +#ifdef CONFIG_MD_RAID1_IOSER
> +        raid1_ioser_exit(&conf->ioser);
> +#endif
>          mempool_destroy(conf->r1bio_pool);
>          kfree(conf->mirrors);
>          safe_put_page(conf->tmppage);
> diff -Naur linux-4.14.79/drivers/md/raid1.h linux-4.14.79-ioser/drivers/md/raid1.h
> --- linux-4.14.79/drivers/md/raid1.h    2018-11-04 14:52:51.000000000 +0100
> +++ linux-4.14.79-ioser/drivers/md/raid1.h      2018-11-06 17:18:43.613299968 +0100
> @@ -53,6 +53,57 @@
>          int raid_disks;
>  };
>
> +#ifdef CONFIG_MD_RAID1_IOSER_MODULE
> +#define CONFIG_MD_RAID1_IOSER
> +#endif
> +
> +#ifdef CONFIG_MD_RAID1_IOSER
> +
> +struct r1wque {
> +        struct task_struct *task;
> +        struct llist_head list;
> +        wait_queue_head_t wait;
> +};
> +
> +struct r1ioser;
> +
> +struct r1chunk_data {
> +        struct r1ioser *ios;
> +        struct rb_node node;
> +        struct bio *bio;
> +        struct bio_list list;
> +        sector_t sect;
> +        unsigned long time;
> +        struct mutex lock;
> +        atomic_t rcnt;
> +        bool wrnd;
> +};
> +
> +struct r1chunk_addr {
> +        struct rb_root root;
> +        struct mutex lock;
> +};
> +
> +struct r1ioser_stat {
> +        unsigned long ovct;
> +};
> +
> +#define RAID1_IOSER_ADDR_BITS (7)
> +#define RAID1_IOSER_ADDR_SIZE (1U << RAID1_IOSER_ADDR_BITS)
> +
> +struct r1ioser {
> +        struct r1chunk_addr addr[RAID1_IOSER_ADDR_SIZE];
> +        mempool_t *memp;
> +        struct r1wque wque;
> +        struct delayed_work gcol;
> +        struct r1ioser_stat __percpu
> +                        *stat;
> +        bool init;
> +        unsigned char bits;
> +};
> +
> +#endif
> +
>  struct r1conf {
>          struct mddev *mddev;
>          struct raid1_info *mirrors;     /* twice 'raid_disks' to
> @@ -127,6 +178,9 @@
>          sector_t cluster_sync_low;
>          sector_t cluster_sync_high;
>
> +#ifdef CONFIG_MD_RAID1_IOSER
> +        struct r1ioser ioser;
> +#endif
>  };
>
>  /*
> diff -Naur linux-4.14.79/drivers/md/raid1-ioser.c linux-4.14.79-ioser/drivers/md/raid1-ioser.c
> --- linux-4.14.79/drivers/md/raid1-ioser.c      1970-01-01 01:00:00.000000000 +0100
> +++ linux-4.14.79-ioser/drivers/md/raid1-ioser.c        2018-11-09 21:00:45.038849286 +0100
> @@ -0,0 +1,615 @@
> +#ifdef CONFIG_MD_RAID1_IOSER
> +
> +#include <linux/kthread.h>
> +
> +/* ilog2 of serialization chunk size in sectors */
> +#define RAID1_IOSER_MIN_CHUNK_BITS (PAGE_SHIFT - 9)
> +#define RAID1_IOSER_MAX_CHUNK_BITS 10 /* 512K */
> +
> +#define RAID1_IOSER_DEF_CHUNK_BITS 7 /* 64K */
> +
> +static int raid1_ioser_chunk_bits = RAID1_IOSER_DEF_CHUNK_BITS;
> +
> +#define RAID1_IOSER_BITS_TO_MASK(b) ((1UL << (b)) - 1UL)
> +#define RAID1_IOSER_CHUNK_BASE(s, b) ((s) & ~RAID1_IOSER_BITS_TO_MASK(b))
> +#define RAID1_IOSER_CHUNK_OFFS(s, b) ((s) & RAID1_IOSER_BITS_TO_MASK(b))
> +
> +#define per_cpu_get(p, f) ( \
> +{ \
> +        unsigned int __c; \
> +        typeof((p)->f) __r = 0; \
> +        for_each_possible_cpu(__c) \
> +                __r += per_cpu_ptr(p, __c)->f; \
> +        __r; \
> +} )
> +
> +#define per_cpu_inc(p, f) \
> +do { \
> +        unsigned int __c = get_cpu(); \
> +        per_cpu_ptr(p, __c)->f++; \
> +        put_cpu(); \
> +} while (0)
> +
> +#define per_cpu_set(p, f, v) \
> +do { \
> +        unsigned int __c; \
> +        for_each_possible_cpu(__c) \
> +                per_cpu_ptr(p, __c)->f = v; \
> +} while (0)
> +
> +static sector_t align_to_barrier_unit_end(sector_t, sector_t);
> +
> +static void raid1_write_request(struct mddev *, struct bio *, int);
> +
> +static void raid1_ioser_bio_endio(struct bio *bio)
> +{
> +        struct r1chunk_data *cdp = bio->bi_private;
> +        struct r1ioser *ios = cdp->ios;
> +
> +        struct llist_node *lp;
> +        struct r1wque *qip;
> +
> +        qip = &ios->wque;
> +        lp = (struct llist_node *)&bio->bi_next;
> +        if (llist_add(lp, &qip->list))
> +                wake_up_interruptible(&qip->wait);
> +}
> +
> +static inline void raid1_ioser_send_bio(struct mddev *mddev, struct bio *bio)
> +{
> +        sector_t sectors;
> +
> +        sectors = align_to_barrier_unit_end(
> +                        bio->bi_iter.bi_sector, bio_sectors(bio));
> +
> +        raid1_write_request(mddev, bio, sectors);
> +}
> +
> +static inline bool raid1_ioser_send_and_unlock(struct mddev *mddev,
> +                struct r1chunk_data *cdp, struct bio *bio)
> +{
> +        struct bio *clone;
> +        bool ret = true;
> +
> +        lockdep_assert_held(&cdp->lock);
> +        /* returns with lock dropped */
> +
> +        clone = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
> +        if (unlikely(clone == NULL)) {
> +                pr_warn("%s: [%s] bio_clone failed\n",
> +                        __func__, mdname(mddev));
> +                ret = bio_list_empty(&cdp->list);
> +        }
> +        else {
> +                cdp->bio = bio;
> +                clone->bi_private = cdp;
> +                clone->bi_end_io = raid1_ioser_bio_endio;
> +                bio = clone;
> +        }
> +        cdp->time = jiffies;
> +        mutex_unlock(&cdp->lock);
> +        raid1_ioser_send_bio(mddev, bio);
> +        return ret;
> +}
> +
> +static inline void raid1_ioser_handle_endio(struct bio *bio)
> +{
> +        struct r1chunk_data *cdp = bio->bi_private;
> +        blk_status_t status = bio->bi_status;
> +
> +        bio_put(bio);
> +        mutex_lock(&cdp->lock);
> +        bio = cdp->bio;
> +        cdp->bio = NULL;
> +        bio->bi_status = status;
> +        bio_endio(bio);
> +get_next:
> +        bio = bio_list_pop(&cdp->list);
> +        if (bio != NULL) {
> +                struct r1ioser *ios = cdp->ios;
> +                struct r1conf *conf = container_of(ios, struct r1conf, ioser);
> +                struct mddev *mddev = conf->mddev;
> +
> +                bool done = raid1_ioser_send_and_unlock(mddev, cdp, bio);
> +
> +                if (unlikely(!done)) {
> +                        mutex_lock(&cdp->lock);
> +                        goto get_next;
> +                }
> +                return;
> +        }
> +        mutex_unlock(&cdp->lock);
> +}
> +
> +static inline void raid1_ioser_wque_cons(struct llist_node *lp)
> +{
> +        struct llist_node *np;
> +        struct bio *bio;
> +
> +        lp = llist_reverse_order(lp);
> +        do {
> +                np = lp->next;
> +                lp->next = NULL;
> +                bio = container_of((void *)lp, struct bio, bi_next);
> +                raid1_ioser_handle_endio(bio);
> +                lp = np;
> +        } while (lp != NULL);
> +}
> +
> +static int raid1_ioser_wque_worker(void *vp)
> +{
> +        struct r1wque *qip = vp;
> +
> +        struct llist_node *lp;
> +        int rc;
> +
> +        allow_signal(SIGKILL);
> +        if (signal_pending(current))
> +                flush_signals(current);
> +        while (true) {
> +                rc = wait_event_interruptible(qip->wait,
> +                        (lp = llist_del_all(&qip->list)) != NULL ||
> +                        kthread_should_stop() || kthread_should_park());
> +                if (unlikely(kthread_should_stop() || kthread_should_park())) {
> +                        if (unlikely(lp != NULL))
> +                                raid1_ioser_wque_cons(lp);
> +                        if (kthread_should_stop())
> +                                break;
> +                        kthread_parkme();
> +                        continue;
> +                }
> +                if (unlikely(rc < 0)) {
> +                        flush_signals(current);
> +                        if (likely(lp == NULL))
> +                                continue;
> +                }
> +                raid1_ioser_wque_cons(lp);
> +        }
> +        return 0;
> +}
> +
> +static int __must_check raid1_ioser_init_wque(struct mddev *mddev,
> +                struct r1wque *qip)
> +{
> +        int err;
> +
> +        BUG_ON(qip->task != NULL);
> +
> +        memset(qip, 0, sizeof(*qip));
> +        init_waitqueue_head(&qip->wait);
> +        qip->task = kthread_create(raid1_ioser_wque_worker,
> +                        qip, "%s_wque", mdname(mddev));
> +        if (unlikely(IS_ERR(qip->task))) {
> +                err = PTR_ERR(qip->task);
> +                qip->task = NULL;
> +                return err;
> +        }
> +        set_user_nice(qip->task, MIN_NICE);
> +        wake_up_process(qip->task);
> +        return 0;
> +}
> +
> +static void raid1_ioser_exit_wque(struct mddev *mddev, struct r1wque *qip)
> +{
> +        struct llist_node *lp;
> +        struct llist_node *np;
> +        struct bio *bio;
> +
> +        if (likely(qip->task != NULL)) {
> +                kthread_stop(qip->task);
> +                qip->task = NULL;
> +        }
> +        lp = llist_del_all(&qip->list);
> +        if (unlikely(lp != NULL)) {
> +                pr_err("%s: [%s] pending IO ?!\n",
> +                        __func__, mdname(mddev));
> +                lp = llist_reverse_order(lp);
> +                do {
> +                        np = lp->next;
> +                        lp->next = NULL;
> +                        bio = container_of((void *)lp, struct bio, bi_next);
> +                        bio_io_error(bio);
> +                        lp = np;
> +                } while (lp != NULL);
> +        }
> +}
> +
> +static inline int __must_check raid1_ioser_comp_sect(sector_t s1, sector_t s2)
> +{
> +        if (s1 < s2)
> +                return -1;
> +        if (s1 > s2)
> +                return 1;
> +        return 0;
> +}
> +
> +static struct r1chunk_data *__must_check
> +raid1_ioser_get_chunk_data(struct r1ioser *ios, sector_t sect)
> +{
> +        struct r1chunk_addr *cap;
> +        struct r1chunk_data *cdp;
> +
> +        struct rb_root *rtp;
> +        struct rb_node **npp;
> +        struct rb_node *pnp = NULL;
> +
> +        int idx, rc;
> +
> +        idx = hash_long(sect, RAID1_IOSER_ADDR_BITS);
> +        cap = &ios->addr[idx];
> +
> +        mutex_lock(&cap->lock);
> +        rtp = &cap->root;
> +        npp = &rtp->rb_node;
> +        while (*npp != NULL) {
> +                pnp = *npp;
> +                cdp = rb_entry(*npp, struct r1chunk_data, node);
> +                rc = raid1_ioser_comp_sect(sect, cdp->sect);
> +                if (rc < 0)
> +                        npp = &(*npp)->rb_left;
> +                else if (rc > 0)
> +                        npp = &(*npp)->rb_right;
> +                else {
> +                        atomic_inc(&cdp->rcnt);
> +                        mutex_unlock(&cap->lock);
> +                        return cdp;
> +                }
> +        }
> +        cdp = mempool_alloc(ios->memp, GFP_NOIO);
> +        cdp->ios = ios;
> +        cdp->bio = NULL;
> +        cdp->sect = sect;
> +        cdp->wrnd = false;
> +        mutex_init(&cdp->lock);
> +        bio_list_init(&cdp->list);
> +        atomic_set(&cdp->rcnt, 1);
> +
> +        rb_link_node(&cdp->node, pnp, npp);
> +        rb_insert_color(&cdp->node, rtp);
> +        mutex_unlock(&cap->lock);
> +        return cdp;
> +}
> +
> +static void raid1_ioser_write_request(struct mddev *mddev,
> +                struct bio *bio, sector_t sectors)
> +{
> +        struct r1conf *conf = mddev->private;
> +        struct r1ioser *ios = &conf->ioser;
> +
> +        struct r1chunk_data *cdp;
> +        sector_t sect;
> +
> +        if (!ios->init) {
> +                raid1_write_request(mddev, bio, sectors);
> +                return;
> +        }
> +        sect = bio->bi_iter.bi_sector;
> +        sect = RAID1_IOSER_CHUNK_BASE(sect, ios->bits);
> +        cdp = raid1_ioser_get_chunk_data(ios, sect);
> +
> +        mutex_lock(&cdp->lock);
> +        atomic_dec(&cdp->rcnt);
> +        if (cdp->bio == NULL && bio_list_empty(&cdp->list))
> +                raid1_ioser_send_and_unlock(mddev, cdp, bio);
> +        else {
> +                bio_list_add(&cdp->list, bio);
> +                per_cpu_inc(ios->stat, ovct);
> +                mutex_unlock(&cdp->lock);
> +        }
> +}
> +
> +static inline void raid1_ioser_gcol_chunk(struct r1chunk_addr *cap,
> +                struct r1chunk_data *cdp)
> +{
> +        struct r1ioser *ios;
> +        struct r1conf *conf;
> +        struct mddev *mddev;
> +
> +        unsigned long diff;
> +
> +        if (!mutex_trylock(&cdp->lock))
> +                return;
> +        if (atomic_read(&cdp->rcnt) > 0) {
> +                cdp->wrnd = false;
> +                mutex_unlock(&cdp->lock);
> +                return;
> +        }
> +        diff = (long)jiffies - (long)(cdp->time);
> +        if (diff <= (180 * HZ)) {
> +                cdp->wrnd = false;
> +                mutex_unlock(&cdp->lock);
> +                return;
> +        }
> +        ios = cdp->ios;
> +        if (cdp->bio != NULL || !bio_list_empty(&cdp->list)) {
> +                if (cdp->wrnd) {
> +                        mutex_unlock(&cdp->lock);
> +                        return;
> +                }
> +                conf = container_of(ios, struct r1conf, ioser);
> +                mddev = conf->mddev;
> +                pr_warn("%s: [%s] %s IO at %lu after %u ms ?!\n",
> +                        __func__, mdname(mddev),
> +                        (cdp->bio != NULL ? "inflight" : "pending"),
> +                        cdp->sect, jiffies_to_msecs(diff));
> +                cdp->wrnd = true;
> +                mutex_unlock(&cdp->lock);
> +                return;
> +        }
> +        mutex_unlock(&cdp->lock);
> +        mutex_destroy(&cdp->lock);
> +        rb_erase(&cdp->node, &cap->root);
> +        mempool_free(cdp, ios->memp);
> +}
> +
> +static void raid1_ioser_gcol_worker(struct work_struct *wkp)
> +{
> +        struct delayed_work *dwp = to_delayed_work(wkp);
> +        struct r1ioser *ios = container_of(dwp, struct r1ioser, gcol);
> +
> +        struct r1chunk_addr *cap;
> +        struct r1chunk_data *cdp;
> +        struct rb_node *ndp;
> +
> +        int i;
> +
> +        for (i = 0; i < ARRAY_SIZE(ios->addr); i++) {
> +                cap = &ios->addr[i];
> +                if (!mutex_trylock(&cap->lock))
> +                        continue;
> +                ndp = rb_first(&cap->root);
> +                while (ndp != NULL) {
> +                        cdp = rb_entry(ndp, struct r1chunk_data, node);
> +                        ndp = rb_next(ndp);
> +                        raid1_ioser_gcol_chunk(cap, cdp);
> +                }
> +                mutex_unlock(&cap->lock);
> +                cond_resched();
> +        }
> +        if (likely(ios->init))
> +                queue_delayed_work(system_long_wq, &ios->gcol, 90 * HZ);
> +}
> +
> +static void raid1_ioser_set_queue_limits(struct r1ioser *ios)
> +{
> +        struct r1conf *conf = container_of(ios, struct r1conf, ioser);
> +        struct mddev *mddev = conf->mddev;
> +
> +        unsigned char bits;
> +        unsigned int bcnt;
> +        unsigned int scnt;
> +
> +        if (ios->bits == 0)
> +                return;
> +        BUG_ON(mddev->queue == NULL);
> +        bits = ilog2(queue_max_sectors(mddev->queue));
> +        ios->bits = min_not_zero(bits, ios->bits);
> +        if (ios->bits < RAID1_IOSER_MIN_CHUNK_BITS)
> +                ios->bits = RAID1_IOSER_MIN_CHUNK_BITS;
> +        scnt = 1U << ios->bits;
> +        bcnt = 1U << (ios->bits + 9);
> +        blk_queue_chunk_sectors(mddev->queue, scnt);
> +        blk_queue_max_hw_sectors(mddev->queue, scnt);
> +        blk_queue_io_opt(mddev->queue, bcnt);
> +        if (blk_queue_discard(mddev->queue)) {
> +                blk_queue_max_discard_sectors(mddev->queue, scnt);
> +                mddev->queue->limits.discard_granularity = bcnt;
> +                mddev->queue->limits.discard_alignment = bcnt;
> +                blk_queue_max_discard_segments(mddev->queue, 1);
> +        }
> +        ios->init = true;
> +        pr_info("%s: [%s] set up with %u KiB chunks\n",
> +                __func__, mdname(mddev), 1U << (ios->bits - 1));
> +        queue_delayed_work(system_long_wq, &ios->gcol, 90 * HZ);
> +}
> +
> +static ssize_t md_raid1_ioser_chunk_size_show(struct mddev *mddev, char *page)
> +{
> +        struct r1conf *conf = mddev->private;
> +        struct r1ioser *ios = &conf->ioser;
> +
> +        return sprintf(page, "%u\n", (ios->bits > 0 ?
> +                        (1U << (ios->bits - 1)) : 0));
> +}
> +
> +static struct md_sysfs_entry md_raid1_ioser_chunk_size_kb =
> +__ATTR(ioser_chunk_size_kb, S_IRUGO, md_raid1_ioser_chunk_size_show, NULL);
> +
> +static ssize_t md_raid1_ioser_overlap_count_show(struct mddev *mddev, char *page)
> +{
> +        struct r1conf *conf = mddev->private;
> +        struct r1ioser *ios = &conf->ioser;
> +
> +        return sprintf(page, "%lu\n", per_cpu_get(ios->stat, ovct));
> +}
> +
> +static ssize_t md_raid1_ioser_overlap_count_store(struct mddev *mddev,
> +                const char *buf, size_t len)
> +{
> +        struct r1conf *conf = mddev->private;
> +        struct r1ioser *ios = &conf->ioser;
> +        long val;
> +
> +        if (kstrtol(buf, 0, &val) != 0 || val != 0)
> +                return -EINVAL;
> +
> +        per_cpu_set(ios->stat, ovct, 0);
> +        return len;
> +}
> +
> +static struct md_sysfs_entry md_raid1_ioser_overlap_count =
> +__ATTR(ioser_overlap_count, S_IRUGO|S_IWUSR,
> +        md_raid1_ioser_overlap_count_show,
> +        md_raid1_ioser_overlap_count_store);
> +
> +static struct attribute *md_raid1_ioser_attrs[] = {
> +        &md_raid1_ioser_chunk_size_kb.attr,
> +        &md_raid1_ioser_overlap_count.attr,
> +        NULL,
> +};
> +
> +static struct attribute_group md_raid1_ioser_group = {
> +        .name = NULL,
> +        .attrs = md_raid1_ioser_attrs,
> +};
> +
> +static int __must_check raid1_ioser_init(struct r1ioser *ios)
> +{
> +        struct r1conf *conf = container_of(ios, struct r1conf, ioser);
> +        struct mddev *mddev = conf->mddev;
> +
> +        struct kmem_cache *kcp;
> +        char name[32];
> +        int rc;
> +
> +        ios->bits = raid1_ioser_chunk_bits;
> +        if (ios->bits == 0) {
> +                BUG_ON(ios->init);
> +                return 0;
> +        }
> +        for (rc = 0; rc < ARRAY_SIZE(ios->addr); rc++) {
> +                mutex_init(&ios->addr[rc].lock);
> +                ios->addr[rc].root = RB_ROOT;
> +        }
> +        INIT_DELAYED_WORK(&ios->gcol, raid1_ioser_gcol_worker);
> +        snprintf(name, sizeof(name), "%s-ioser", mdname(mddev));
> +        name[sizeof(name) - 1] = '\0';
> +        kcp = kmem_cache_create(name, sizeof(struct r1chunk_data),
> +                        0, SLAB_HWCACHE_ALIGN, NULL);
> +        if (unlikely(kcp == NULL)) {
> +                pr_err("%s: [%s] kmem_cache_create failed\n",
> +                        __func__, mdname(mddev));
> +                return -ENOMEM;
> +        }
> +        ios->memp = mempool_create_slab_pool(num_possible_cpus(), kcp);
> +        if (unlikely(ios->memp == NULL)) {
> +                pr_err("%s: [%s] mempool_create failed\n",
> +                        __func__, mdname(mddev));
> +                kmem_cache_destroy(kcp);
> +                return -ENOMEM;
> +        }
> +        ios->stat = alloc_percpu(struct r1ioser_stat);
> +        if (unlikely(ios->stat == NULL)) {
> +                pr_err("%s: [%s] alloc_percpu failed (%d)\n",
> +                        __func__, mdname(mddev), rc);
> +                mempool_destroy(ios->memp);
> +                kmem_cache_destroy(kcp);
> +                ios->memp = NULL;
> +                return -ENOMEM;
> +        }
> +        rc = raid1_ioser_init_wque(mddev, &ios->wque);
> +        if (unlikely(rc < 0)) {
> +                pr_err("%s: [%s] init_wque failed (%d)\n",
> +                        __func__, mdname(mddev), rc);
> +                mempool_destroy(ios->memp);
> +                kmem_cache_destroy(kcp);
> +                ios->memp = NULL;
> +                free_percpu(ios->stat);
> +                ios->stat = NULL;
> +                return rc;
> +        }
> +        if (likely(mddev->kobj.sd != NULL) &&
> +            sysfs_create_group(&mddev->kobj, &md_raid1_ioser_group))
> +                pr_warn("%s: [%s] cannot register extra attributes\n",
> +                        __func__, mdname(mddev));
> +        return 0;
> +}
> +
> +static void raid1_ioser_exit(struct r1ioser *ios)
> +{
> +        struct r1conf *conf = container_of(ios, struct r1conf, ioser);
> +        struct mddev *mddev = conf->mddev;
> +
> +        struct r1chunk_addr *cap;
> +        struct r1chunk_data *cdp;
> +        struct kmem_cache *kcp;
> +        struct rb_node *ndp;
> +        struct bio *bio;
> +
> +        int i;
> +
> +        if (ios->bits == 0)
> +                return;
> +        ios->init = false;
> +        if (likely(mddev->kobj.sd != NULL))
> +                sysfs_remove_group(&mddev->kobj, &md_raid1_ioser_group);
> +        cancel_delayed_work_sync(&ios->gcol);
> +        raid1_ioser_exit_wque(mddev, &ios->wque);
> +        for (i = 0; i < ARRAY_SIZE(ios->addr); i++) {
> +                cap = &ios->addr[i];
> +                mutex_lock(&cap->lock);
> +                ndp = rb_first(&cap->root);
> +                while (ndp != NULL) {
> +                        cdp = rb_entry(ndp, struct r1chunk_data, node);
> +                        ndp = rb_next(ndp);
> +                        mutex_lock(&cdp->lock);
> +                        WARN(cdp->bio != NULL, "%s: [%s] IO in flight\n",
> +                                __func__, mdname(mddev));
> +                        bio = bio_list_pop(&cdp->list);
> +                        while (unlikely(bio != NULL)) {
> +                                bio_io_error(bio);
> +                                bio = bio_list_pop(&cdp->list);
> +                        }
> +                        mutex_unlock(&cdp->lock);
> +                        mutex_destroy(&cdp->lock);
> +                        rb_erase(&cdp->node, &cap->root);
> +                        mempool_free(cdp, ios->memp);
> +                }
> +                mutex_unlock(&cap->lock);
> +                mutex_destroy(&cap->lock);
> +        }
> +        if (likely(ios->memp != NULL)) {
> +                kcp = ios->memp->pool_data;
> +                mempool_destroy(ios->memp);
> +                kmem_cache_destroy(kcp);
> +                ios->memp = NULL;
> +        }
> +        if (likely(ios->stat != NULL)) {
> +                free_percpu(ios->stat);
> +                ios->stat = NULL;
> +        }
> +}
> +
> +static int raid1_ioser_chunk_size_show(char *cp, const struct kernel_param *kp)
> +{
> +        return sprintf(cp, "%u\n", (raid1_ioser_chunk_bits > 0 ?
> +                        (1U << (raid1_ioser_chunk_bits - 1)) : 0));
> +}
> +
> +static int raid1_ioser_chunk_size_store(const char *cp,
> +                const struct kernel_param *kp)
> +{
> +        size_t size;
> +        int bits;
> +
> +        if (kstrtoul(cp, 0, &size) != 0)
> +                return -EINVAL;
> +
> +        if (size == 0) {
> +                if (raid1_ioser_chunk_bits != 0) {
> +                        pr_info("%s: IO serialization disabled\n", __func__);
> +                        raid1_ioser_chunk_bits = 0;
> +                }
> +                return 0;
> +        }
> +        if (raid1_ioser_chunk_bits == 0)
> +                pr_info("%s: IO serialization enabled\n", __func__);
> +
> +        bits = order_base_2(size) + 1;
> +        raid1_ioser_chunk_bits = clamp(bits,
> +                RAID1_IOSER_MIN_CHUNK_BITS, RAID1_IOSER_MAX_CHUNK_BITS);
> +        return 0;
> +}
> +
> +static const struct kernel_param_ops raid1_ioser_chunk_size_ops = {
> +        .set = raid1_ioser_chunk_size_store,
> +        .get = raid1_ioser_chunk_size_show,
> +};
> +
> +module_param_cb(ioser_chunk_size_kb,
> +        &raid1_ioser_chunk_size_ops,
> +        NULL, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(ioser_chunk_size_kb,
> +        " IO serialization chunk size in KiB (zero disables it)");
> +
> +#endif
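
As an aside for readers skimming the attached patch: the queueing discipline implemented
by raid1_ioser_write_request() and raid1_ioser_handle_endio() boils down to "at most one
write in flight per chunk; later writes for the same chunk wait in a list and are drained
from the completion path". The toy, single-threaded model below shows just that idea;
every name in it is invented, and it deliberately ignores the per-bucket rbtrees, mutexes,
mempool and kthread that the real patch needs.

/*
 * Toy, single-threaded model of per-chunk write serialization.
 * Purely illustrative; not taken from the patch above.
 */
#include <stdio.h>
#include <stdlib.h>

#define CHUNK_BITS 7                                 /* 2^7 sectors = 64 KiB */
#define CHUNK_BASE(s) ((s) & ~((1UL << CHUNK_BITS) - 1UL))

struct toy_write {
        unsigned long sector;
        struct toy_write *next;                      /* FIFO of queued writes */
};

struct toy_chunk {
        unsigned long base;
        struct toy_write *inflight;                  /* at most one per chunk */
        struct toy_write *head, *tail;               /* writes waiting behind */
        struct toy_chunk *link;                      /* naive chunk list      */
};

static struct toy_chunk *chunks;

static struct toy_chunk *get_chunk(unsigned long base)
{
        struct toy_chunk *c;

        for (c = chunks; c; c = c->link)
                if (c->base == base)
                        return c;
        c = calloc(1, sizeof(*c));
        c->base = base;
        c->link = chunks;
        chunks = c;
        return c;
}

/* submit path: the first write for a chunk is dispatched, later ones queue up */
static void submit_write(struct toy_write *w)
{
        struct toy_chunk *c = get_chunk(CHUNK_BASE(w->sector));

        if (!c->inflight) {
                c->inflight = w;
                printf("dispatch sector %lu (chunk %lu)\n", w->sector, c->base);
        } else {
                w->next = NULL;
                if (c->tail)
                        c->tail->next = w;
                else
                        c->head = w;
                c->tail = w;
                printf("queue    sector %lu behind chunk %lu\n", w->sector, c->base);
        }
}

/* completion path: the next queued write for the same chunk goes out */
static void complete_write(struct toy_chunk *c)
{
        struct toy_write *w = c->head;

        c->inflight = NULL;
        if (w) {
                c->head = w->next;
                if (!c->head)
                        c->tail = NULL;
                c->inflight = w;
                printf("dispatch sector %lu (chunk %lu, from queue)\n",
                       w->sector, c->base);
        }
}

int main(void)
{
        struct toy_write a = { .sector = 8 }, b = { .sector = 16 }, d = { .sector = 300 };

        submit_write(&a);                            /* dispatched                 */
        submit_write(&b);                            /* same 64 KiB chunk: queued  */
        submit_write(&d);                            /* different chunk: dispatched */
        complete_write(get_chunk(CHUNK_BASE(a.sector)));  /* now b is dispatched   */
        return 0;
}

The real patch additionally frees idle chunk state from a delayed "garbage collection"
work item and accounts overlapping writes in the ioser_overlap_count sysfs attribute.
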