[PATCH] md/raid1: IO write serialization per chunks

From: Florian-Ewald Mueller <florian-ewald.mueller@xxxxxxxxxxxxxxxx>

Motivation:

Raid1 is supposed to simply replicate data to all of its configured slave
devices (legs). Data is expected to be written identically to all its legs,
thereby producing a "mirrored set" of drives.

Unsynchronized, consecutive, overlapping writes can nevertheless produce
different results on raid1's legs. This happens easily due to (write) IO
reordering on the way to the slave devices (possibly through many stacked
drivers).

One can argue that the application issuing the IO (above raid1) should take
care of synchronizing its writes. We (at Profitbricks) think that this should
be the concern of the application (or whatever layer) only up to the raid1
level: the application above raid1 sees it as a "disk" and does not expect
further (weird) artifacts from it. With raid1 legs containing different data
and raid1's read balance logic, consecutive reads can deliver different
results even from a device that has not been written to in between.

One elegant feature of the md-raid implementation is the back-and-forth
convertibility between raid1 and raid5 with 2 legs. If the contents of the 2
raid1 legs are allowed (by the SW) to drift apart so easily during normal
operation, that conversion is reduced to absurdity. Also, in practice,
building SW stacks on top of the current md/raid1 implementation leads to
annoying and unnecessary difficulties when 1 leg is removed (e.g. for
maintenance).

Workaround (at Profitbricks):

Obviously, serializing and synchronizing all write IO to a device leads to
unacceptable performance degradation. The idea here is to divide the raid1
device into small chunks, allow IO of at most that chunk size, and serialize
the writes per chunk. Write IO to different chunks can still happen in
"parallel", asynchronously and not serialized against each other, with almost
unaltered performance.
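
As a minimal illustration of the mapping (a sketch with made-up names only,
not the patch code), a write's start sector selects its serialization chunk
by masking off the low chunk bits:

  /* Illustrative sketch; chunk_base() is a hypothetical helper. */
  #include <stdio.h>

  typedef unsigned long long sector_t;

  /* bits = ilog2 of the chunk size in 512-byte sectors, e.g. 7 -> 64 KiB */
  static sector_t chunk_base(sector_t sector, unsigned int bits)
  {
          return sector & ~(((sector_t)1 << bits) - 1);
  }

  int main(void)
  {
          /* Writes starting at sectors 100 and 120 map to the same 64 KiB
           * chunk (base 0) and are serialized against each other; a write
           * at sector 300 maps to base 256 and proceeds independently. */
          printf("%llu %llu %llu\n",
                 chunk_base(100, 7), chunk_base(120, 7), chunk_base(300, 7));
          return 0;
  }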

The ideal chunk size depends heavily on the IO pattern that the application
using raid1 in its SW stack is issuing. The chunk size is also a trade-off
between the disadvantages of (more) IO splitting and the advantages of
performing parallel, asynchronous IO on multiple chunks.

We (at Profitbricks) wrote a "lazy" patch against md/raid1 from the mainline
Linux kernel aiming to solve this. The implementation is "lazy" in the sense
that our goal was to solve the problem described above in a decent and
time-efficient manner.

Our patch tries to interfere as little as possible with the original md/raid1
implementation (also in order to avoid introducing regressions). To that end
we built an encapsulated serialization layer per chunk, logically above and
separate from the original raid1 write IO processing.

We simply make sure that the IO is divided into chunks of a configurable size
(between 4K and 512K) before reaching the raid1 block device, and serialize
the in-flight writes per chunk (imposing a write queue depth of 1 per chunk).
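
A small sketch of the split-size arithmetic for a write that straddles a
chunk boundary (illustrative only, with hypothetical names; the patch itself
uses bio_split() and bio_chain() for this):

  /* Illustrative sketch: sectors of a boundary-crossing write that stay in
   * its first chunk; the remainder would become a chained split bio. */
  #include <assert.h>

  typedef unsigned long long sector_t;

  static unsigned int first_chunk_sectors(sector_t start, unsigned int len,
                                          unsigned int bits)
  {
          sector_t mask = ((sector_t)1 << bits) - 1;
          unsigned int tail = (unsigned int)((start + len) & mask);

          return len - tail;
  }

  int main(void)
  {
          /* 64 KiB chunks (bits = 7): a 16-sector write starting at sector
           * 120 crosses the chunk boundary at sector 128, so 8 sectors stay
           * in the first chunk and 8 are split off into the next one. */
          assert(first_chunk_sectors(120, 16, 7) == 8);
          return 0;
  }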

In order to keep the patch compact, self-contained and less intrusive:
1. incoming write IO is cloned so that we can receive the acknowledgment of
   its completion (a rough sketch of this pattern follows the list),
2. the completions are passed through a per-instance worker thread to
   decouple the send and receive paths, and
3. chunk_size_kb and overlap_count sysfs entries are added per md (raid1)
   device for observation.
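
As a rough illustration of items 1 and 2 (a sketch with hypothetical names,
not the code in this patch; the real implementation defers the completion to
the per-instance worker thread instead of finishing it inline):

  /* Kernel-style sketch; 'struct my_chunk' and both functions are made up.
   * Error handling (e.g. a failed clone) is omitted for brevity. */
  struct my_chunk {
          struct bio *inflight;   /* original bio awaiting acknowledgment */
  };

  static void my_clone_endio(struct bio *clone)
  {
          struct my_chunk *chunk = clone->bi_private;
          struct bio *orig = chunk->inflight;

          orig->bi_status = clone->bi_status;     /* propagate the result */
          bio_put(clone);
          chunk->inflight = NULL;
          bio_endio(orig);                        /* ack the original writer */
  }

  static void my_submit_serialized(struct mddev *mddev, struct my_chunk *chunk,
                                   struct bio *bio)
  {
          struct bio *clone = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);

          chunk->inflight = bio;          /* write queue depth of 1 per chunk */
          clone->bi_private = chunk;
          clone->bi_end_io = my_clone_endio;
          /* ... hand 'clone' down the normal raid1 write path ... */
  }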

Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@xxxxxxxxxxxxxxxx>
[jwang: reformat the commit message, testing, cleanup, rebase to 4.19-rc8]
Signed-off-by: Jack Wang <jinpu.wang@xxxxxxxxxxxxxxxx>
---
 drivers/md/Kconfig       |  10 +
 drivers/md/raid1-ioser.c | 673 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/raid1.c       |  36 +++
 drivers/md/raid1.h       |  82 ++++++
 4 files changed, 801 insertions(+)
 create mode 100644 drivers/md/raid1-ioser.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 8b8c123cae66..d961126443b9 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -100,6 +100,16 @@ config MD_RAID1
 
 	  If unsure, say Y.
 
+config MD_RAID1_IOSER
+	tristate "RAID-1 I/O serialization per chunks"
+	depends on MD_RAID1
+	default n
+	---help---
+	Serialize I/O per (configurable) chunks in order to avoid RAID-1 mirrors
+	drifting apart with unsynchronized overlapping writes to the same chunk.
+
+	If unsure, say N.
+
 config MD_RAID10
 	tristate "RAID-10 (mirrored striping) mode"
 	depends on BLK_DEV_MD
diff --git a/drivers/md/raid1-ioser.c b/drivers/md/raid1-ioser.c
new file mode 100644
index 000000000000..b4ce3040eb52
--- /dev/null
+++ b/drivers/md/raid1-ioser.c
@@ -0,0 +1,673 @@
+#ifdef CONFIG_MD_RAID1_IOSER
+
+#include <linux/kthread.h>
+#include <linux/sched/signal.h>
+
+/* ilog2 of serialization chunk size in sectors */
+#define RAID1_IOSER_MIN_CHUNK_BITS	(PAGE_SHIFT - 9)
+#define RAID1_IOSER_MAX_CHUNK_BITS	10	/* 512K */
+
+#define RAID1_IOSER_DEF_CHUNK_BITS	7	/*  64K */
+
+static int raid1_ioser_chunk_bits = RAID1_IOSER_DEF_CHUNK_BITS;
+
+#define RAID1_IOSER_BITS_TO_MASK(b)	((1UL << (b)) - 1UL)
+#define RAID1_IOSER_CHUNK_BASE(s, b)	((s) & ~RAID1_IOSER_BITS_TO_MASK(b))
+#define RAID1_IOSER_CHUNK_OFFS(s, b)	((s) & RAID1_IOSER_BITS_TO_MASK(b))
+
+static sector_t align_to_barrier_unit_end(sector_t, sector_t);
+
+static void raid1_write_request(struct mddev *, struct bio *, int);
+
+static inline void __raid1_ioser_wait_drain(struct r1ioser *ios)
+{
+	io_wait_event(ios->wait, atomic_read(&ios->ioct) <= 0);
+}
+
+static inline void raid1_ioser_dec_ioct(struct r1ioser *ios)
+{
+	if (atomic_dec_and_test(&ios->ioct))
+		wake_up_all(&ios->wait);
+}
+
+static inline void raid1_ioser_wait_drain(struct r1ioser *ios)
+{
+	if (ios->init)
+		__raid1_ioser_wait_drain(ios);
+}
+
+static inline void raid1_ioser_stop_and_drain(struct r1ioser *ios)
+{
+	if (ios->init) {
+		atomic_inc(&ios->stop);
+		__raid1_ioser_wait_drain(ios);
+	}
+}
+
+static inline void raid1_ioser_start(struct r1ioser *ios)
+{
+	if (ios->init) {
+		if (atomic_dec_and_test(&ios->stop))
+			wake_up_all(&ios->barr);
+	}
+}
+
+static void raid1_ioser_bio_endio(struct bio *bio)
+{
+	struct r1chunk_data *cdp = bio->bi_private;
+	struct r1ioser *ios = cdp->ios;
+
+	struct llist_node *lp;
+	struct r1wque *qip;
+
+	qip = &ios->wque;
+	lp = (struct llist_node *)&bio->bi_next;
+	if (llist_add(lp, &qip->list))
+		wake_up_interruptible(&qip->wait);
+}
+
+static inline void raid1_ioser_write_bio(struct mddev *mddev, struct bio *bio)
+{
+	sector_t sectors;
+
+	sectors = align_to_barrier_unit_end(
+		bio->bi_iter.bi_sector, bio_sectors(bio));
+
+	raid1_write_request(mddev, bio, sectors);
+}
+
+static inline bool raid1_ioser_write_and_unlock(struct mddev *mddev,
+	struct r1chunk_data *cdp, struct bio *bio)
+{
+	struct r1ioser *ios = cdp->ios;
+	struct bio *clone;
+	bool ret = true;
+
+	lockdep_assert_held(&cdp->lock);
+	/* returns with lock dropped */
+
+	clone = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
+	if (unlikely(clone == NULL)) {
+		pr_warn("%s: [%s] bio_clone failed\n",
+			__func__, mdname(mddev));
+		ret = bio_list_empty(&cdp->list);
+	}
+	else {
+		cdp->bio = bio;
+		clone->bi_private = cdp;
+		clone->bi_end_io = raid1_ioser_bio_endio;
+		bio = clone;
+	}
+	cdp->time = jiffies;
+	mutex_unlock(&cdp->lock);
+	raid1_ioser_write_bio(mddev, bio);
+	raid1_ioser_dec_ioct(ios);
+	return ret;
+}
+
+static inline void raid1_ioser_handle_endio(struct bio *bio)
+{
+	struct r1chunk_data *cdp = bio->bi_private;
+	blk_status_t status = bio->bi_status;
+
+	bio_put(bio);
+	mutex_lock(&cdp->lock);
+	bio = cdp->bio;
+	cdp->bio = NULL;
+	bio->bi_status = status;
+	bio_endio(bio);
+get_next:
+	bio = bio_list_pop(&cdp->list);
+	if (bio != NULL) {
+		struct r1ioser *ios = cdp->ios;
+		struct r1conf *conf = container_of(ios, struct r1conf, ioser);
+		struct mddev *mddev = conf->mddev;
+
+		bool done = raid1_ioser_write_and_unlock(mddev, cdp, bio);
+
+		if (unlikely(!done)) {
+			mutex_lock(&cdp->lock);
+			goto get_next;
+		}
+		return;
+	}
+	mutex_unlock(&cdp->lock);
+}
+
+static inline void raid1_ioser_wque_cons(struct llist_node *lp)
+{
+	struct llist_node *np;
+	struct bio *bio;
+
+	do {
+		np = lp->next;
+		lp->next = NULL;
+		bio = container_of((void *)lp, struct bio, bi_next);
+		raid1_ioser_handle_endio(bio);
+		lp = np;
+	} while (lp != NULL);
+}
+
+static int raid1_ioser_wque_worker(void *vp)
+{
+	struct r1wque *qip = vp;
+
+	struct llist_node *lp;
+	int rc;
+
+	allow_signal(SIGKILL);
+	if (signal_pending(current))
+		flush_signals(current);
+	while (true) {
+		rc = wait_event_interruptible(qip->wait,
+			(lp = llist_del_all(&qip->list)) != NULL ||
+			 kthread_should_stop() || kthread_should_park());
+		if (unlikely(kthread_should_stop() || kthread_should_park())) {
+			if (unlikely(lp != NULL)) {
+				lp = llist_reverse_order(lp);
+				raid1_ioser_wque_cons(lp);
+			}
+			if (kthread_should_stop())
+				break;
+			kthread_parkme();
+			continue;
+		}
+		if (unlikely(rc < 0)) {
+			flush_signals(current);
+			if (likely(lp == NULL))
+				continue;
+		}
+		lp = llist_reverse_order(lp);
+		raid1_ioser_wque_cons(lp);
+	}
+	return 0;
+}
+
+static int __must_check raid1_ioser_init_wque(struct mddev *mddev, struct r1wque *qip)
+{
+	int err;
+
+	BUG_ON(qip->task != NULL);
+
+	memset(qip, 0, sizeof(*qip));
+	init_waitqueue_head(&qip->wait);
+	qip->task = kthread_create(raid1_ioser_wque_worker,
+				qip, "%s_wque", mdname(mddev));
+	if (unlikely(IS_ERR(qip->task))) {
+		err = PTR_ERR(qip->task);
+		qip->task = NULL;
+		return err;
+	}
+	set_user_nice(qip->task, MIN_NICE);
+	wake_up_process(qip->task);
+	return 0;
+}
+
+static void raid1_ioser_exit_wque(struct mddev *mddev, struct r1wque *qip)
+{
+	struct llist_node *lp;
+	struct llist_node *np;
+	struct bio *bio;
+
+	if (likely(qip->task != NULL)) {
+		kthread_stop(qip->task);
+		qip->task = NULL;
+	}
+	lp = llist_del_all(&qip->list);
+	if (unlikely(lp != NULL)) {
+		pr_err("%s: [%s] pending IO ?!\n",
+			__func__, mdname(mddev));
+		lp = llist_reverse_order(lp);
+		do {
+			np = lp->next;
+			lp->next = NULL;
+			bio = container_of((void *)lp, struct bio, bi_next);
+			bio_io_error(bio);
+			lp = np;
+		} while (lp != NULL);
+	}
+}
+
+/*
+ * The sector is aligned to chunk_size, so we might get some false-positive
+ * serialization: e.g. an 8K IO at the beginning of a 64K chunk and an 8K IO
+ * at the end of the same chunk will be serialized against each other.
+ * We accept that to reduce complexity and overhead.
+ */
+static inline int __must_check raid1_ioser_comp_sect(sector_t s1, sector_t s2)
+{
+	if (s1 < s2)
+		return -1;
+	if (s1 > s2)
+		return 1;
+	return 0;
+}
+
+static struct r1chunk_data *__must_check
+raid1_ioser_get_chunk_data(struct r1ioser *ios, sector_t sect)
+{
+	struct r1chunk_addr *cap;
+	struct r1chunk_data *cdp;
+
+	struct rb_root *rtp;
+	struct rb_node **npp;
+	struct rb_node *pnp = NULL;
+
+	int idx, rc;
+
+	idx = hash_long(sect, RAID1_IOSER_ADDR_BITS);
+	cap = &ios->addr[idx];
+	mutex_lock(&cap->lock);
+	rtp = &cap->root;
+	npp = &rtp->rb_node;
+	while (*npp != NULL) {
+		pnp = *npp;
+		cdp = rb_entry(*npp, struct r1chunk_data, node);
+		rc = raid1_ioser_comp_sect(sect, cdp->sect);
+		if (rc < 0)
+			npp = &(*npp)->rb_left;
+		else if (rc > 0)
+			npp = &(*npp)->rb_right;
+		else {
+			atomic_inc(&cdp->rcnt);
+			mutex_unlock(&cap->lock);
+			return cdp;
+		}
+	}
+	cdp = mempool_alloc(ios->memp, GFP_NOIO);
+	cdp->wrnd = false;
+	cdp->bio = NULL;
+	cdp->ios = ios;
+	cdp->sect = sect;
+	mutex_init(&cdp->lock);
+	bio_list_init(&cdp->list);
+	atomic_set(&cdp->rcnt, 1);
+	rb_link_node(&cdp->node, pnp, npp);
+	rb_insert_color(&cdp->node, rtp);
+	mutex_unlock(&cap->lock);
+	return cdp;
+}
+
+static inline void raid1_ioser_handle_bio(struct mddev *mddev, struct bio *bio)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1ioser *ios = &conf->ioser;
+
+	struct r1chunk_data *cdp;
+	sector_t sect = bio->bi_iter.bi_sector;
+
+	sect = RAID1_IOSER_CHUNK_BASE(sect, ios->bits);
+	cdp = raid1_ioser_get_chunk_data(ios, sect);
+	mutex_lock(&cdp->lock);
+	atomic_dec(&cdp->rcnt);
+	if (cdp->bio == NULL && bio_list_empty(&cdp->list))
+		raid1_ioser_write_and_unlock(mddev, cdp, bio);
+	else {
+		bio_list_add(&cdp->list, bio);
+		per_cpu_inc(ios->stat, ovct);
+		mutex_unlock(&cdp->lock);
+	}
+}
+
+static void
+raid1_ioser_write_request(struct mddev *mddev,
+		struct bio *bio, sector_t sectors)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1ioser *ios = &conf->ioser;
+
+	struct bio *split;
+	unsigned int bsz, ssz;
+	sector_t sct, beg, end;
+
+	if (!ios->init) {
+		raid1_write_request(mddev, bio, sectors);
+		return;
+	}
+	wait_event(ios->barr, atomic_read(&ios->stop) <= 0);
+	atomic_inc(&ios->ioct);
+	bsz = bio_sectors(bio);
+	if (likely(bsz > 0)) {
+		/*
+		 * Check for one page bio crossing chunks ...
+		 */
+		sct = bio->bi_iter.bi_sector;
+		beg = RAID1_IOSER_CHUNK_BASE(sct, ios->bits);
+		end = RAID1_IOSER_CHUNK_BASE(sct + bsz - 1, ios->bits);
+		if (unlikely(beg != end)) {
+			ssz = bsz - RAID1_IOSER_CHUNK_OFFS(sct + bsz, ios->bits);
+			split = bio_split(bio, ssz, GFP_NOIO, &mddev->bio_set);
+			if (unlikely(split == NULL))
+				pr_warn("%s: [%s] bio_split failed\n",
+					__func__, mdname(mddev));
+			else {
+				bio_chain(split, bio);
+				atomic_inc(&ios->ioct);
+				md_write_inc(mddev, split);
+				raid1_ioser_handle_bio(mddev, split);
+			}
+		}
+	}
+	raid1_ioser_handle_bio(mddev, bio);
+}
+
+static inline void raid1_ioser_gcol_chunk(struct r1chunk_addr *cap,
+					  struct r1chunk_data *cdp)
+{
+	struct r1ioser *ios;
+	struct r1conf *conf;
+	struct mddev *mddev;
+
+	unsigned long diff;
+
+	if (!mutex_trylock(&cdp->lock))
+		return;
+	if (atomic_read(&cdp->rcnt) > 0) {
+		cdp->wrnd = false;
+		mutex_unlock(&cdp->lock);
+		return;
+	}
+	diff = (long)jiffies - (long)(cdp->time);
+	if (diff <= (180 * HZ)) {
+		cdp->wrnd = false;
+		mutex_unlock(&cdp->lock);
+		return;
+	}
+	ios = cdp->ios;
+	if (cdp->bio != NULL || !bio_list_empty(&cdp->list)) {
+		if (cdp->wrnd) {
+			mutex_unlock(&cdp->lock);
+			return;
+		}
+		conf = container_of(ios, struct r1conf, ioser);
+		mddev = conf->mddev;
+		pr_warn("%s: [%s] %s IO at %lu after %u ms ?!\n",
+			__func__, mdname(mddev),
+			(cdp->bio != NULL ? "inflight" : "pending"),
+			cdp->sect, jiffies_to_msecs(diff));
+		cdp->wrnd = true;
+		mutex_unlock(&cdp->lock);
+		return;
+	}
+	mutex_unlock(&cdp->lock);
+	mutex_destroy(&cdp->lock);
+	rb_erase(&cdp->node, &cap->root);
+	mempool_free(cdp, ios->memp);
+}
+
+static void raid1_ioser_gcol_worker(struct work_struct *wkp)
+{
+	struct delayed_work *dwp = to_delayed_work(wkp);
+	struct r1ioser *ios = container_of(dwp, struct r1ioser, gcol);
+
+	struct r1chunk_addr *cap;
+	struct r1chunk_data *cdp;
+	struct rb_node *ndp;
+
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ios->addr); i++) {
+		cap = &ios->addr[i];
+		if (!mutex_trylock(&cap->lock))
+			continue;
+		ndp = rb_first(&cap->root);
+		while (ndp != NULL) {
+			cdp = rb_entry(ndp, struct r1chunk_data, node);
+			ndp = rb_next(ndp);
+			raid1_ioser_gcol_chunk(cap, cdp);
+		}
+		mutex_unlock(&cap->lock);
+		cond_resched();
+	}
+	if (likely(ios->init))
+		queue_delayed_work(system_long_wq, &ios->gcol, 100 * HZ);
+}
+
+static void raid1_ioser_set_queue_limits(struct r1ioser *ios)
+{
+	struct r1conf *conf = container_of(ios, struct r1conf, ioser);
+	struct mddev *mddev = conf->mddev;
+
+	unsigned char bits;
+	unsigned int bcnt;
+	unsigned int scnt;
+
+	if (ios->bits == 0)
+		return;
+	BUG_ON(mddev->queue == NULL);
+	bits = ilog2(queue_max_sectors(mddev->queue));
+	ios->bits = min_not_zero(bits, ios->bits);
+	if (ios->bits < RAID1_IOSER_MIN_CHUNK_BITS)
+		ios->bits = RAID1_IOSER_MIN_CHUNK_BITS;
+	scnt = 1U << ios->bits;
+	bcnt = 1U << (ios->bits + 9);
+	blk_queue_chunk_sectors(mddev->queue, scnt);
+	blk_queue_max_hw_sectors(mddev->queue, scnt);
+	blk_queue_io_opt(mddev->queue, bcnt);
+	if (blk_queue_discard(mddev->queue)) {
+		blk_queue_max_discard_sectors(mddev->queue, scnt);
+		mddev->queue->limits.discard_granularity = bcnt;
+		mddev->queue->limits.discard_alignment = bcnt;
+		blk_queue_max_discard_segments(mddev->queue, 1);
+	}
+	ios->init = true;
+	pr_info("%s: [%s] set up with %u KiB chunks\n",
+		__func__, mdname(mddev), 1U << (ios->bits - 1));
+	queue_delayed_work(system_long_wq, &ios->gcol, 100 * HZ);
+}
+
+static ssize_t
+md_raid1_ioser_chunk_size_show(struct mddev *mddev, char *page)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1ioser *ios = &conf->ioser;
+
+	return sprintf(page, "%u\n", (ios->bits > 0 ?
+			(1U << (ios->bits - 1)) : 0));
+}
+
+static struct md_sysfs_entry md_raid1_ioser_chunk_size_kb =
+__ATTR(ioser_chunk_size_kb, S_IRUGO, md_raid1_ioser_chunk_size_show, NULL);
+
+static ssize_t
+md_raid1_ioser_overlap_count_show(struct mddev *mddev, char *page)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1ioser *ios = &conf->ioser;
+
+	return sprintf(page, "%lu\n", per_cpu_get(ios->stat, ovct));
+}
+
+static ssize_t
+md_raid1_ioser_overlap_count_store(struct mddev *mddev, const char *buf, size_t len)
+{
+	struct r1conf *conf = mddev->private;
+	struct r1ioser *ios = &conf->ioser;
+	long val;
+
+	if (kstrtol(buf, 0, &val) != 0 || val != 0)
+		return -EINVAL;
+
+	per_cpu_set(ios->stat, ovct, 0);
+	return len;
+}
+
+static struct md_sysfs_entry md_raid1_ioser_overlap_count =
+__ATTR(ioser_overlap_count, S_IRUGO|S_IWUSR,
+	md_raid1_ioser_overlap_count_show,
+	md_raid1_ioser_overlap_count_store);
+
+static struct attribute *md_raid1_ioser_attrs[] = {
+	&md_raid1_ioser_chunk_size_kb.attr,
+	&md_raid1_ioser_overlap_count.attr,
+	NULL,
+};
+
+static struct attribute_group md_raid1_ioser_group = {
+	.name = NULL,
+	.attrs = md_raid1_ioser_attrs,
+};
+
+static int __must_check
+raid1_ioser_init(struct r1ioser *ios)
+{
+	struct r1conf *conf = container_of(ios, struct r1conf, ioser);
+	struct mddev *mddev = conf->mddev;
+
+	struct kmem_cache *kcp;
+	char name[32];
+	int rc;
+
+	ios->bits = raid1_ioser_chunk_bits;
+	if (ios->bits == 0) {
+		BUG_ON(ios->init);
+		return 0;
+	}
+	for (rc = 0; rc < ARRAY_SIZE(ios->addr); rc++) {
+		mutex_init(&ios->addr[rc].lock);
+		ios->addr[rc].root = RB_ROOT;
+	}
+	init_waitqueue_head(&ios->barr);
+	init_waitqueue_head(&ios->wait);
+	INIT_DELAYED_WORK(&ios->gcol, raid1_ioser_gcol_worker);
+	snprintf(name, sizeof(name), "%s-ioser", mdname(mddev));
+	name[sizeof(name) - 1] = '\0';
+	kcp = kmem_cache_create(name, sizeof(struct r1chunk_data),
+				0, SLAB_HWCACHE_ALIGN, NULL);
+	if (unlikely(kcp == NULL)) {
+		pr_err("%s: [%s] kmem_cache_create failed\n",
+			__func__, mdname(mddev));
+		return -ENOMEM;
+	}
+	ios->memp = mempool_create_slab_pool(num_possible_cpus(), kcp);
+	if (unlikely(ios->memp == NULL)) {
+		pr_err("%s: [%s] mempool_create failed\n",
+			__func__, mdname(mddev));
+		kmem_cache_destroy(kcp);
+		return -ENOMEM;
+	}
+	ios->stat = alloc_percpu(struct r1ioser_stat);
+	if (unlikely(ios->stat == NULL)) {
+		pr_err("%s: [%s] alloc_percpu failed (%d)\n",
+			__func__, mdname(mddev), rc);
+		mempool_destroy(ios->memp);
+		kmem_cache_destroy(kcp);
+		ios->memp = NULL;
+		return -ENOMEM;
+	}
+	rc = raid1_ioser_init_wque(mddev, &ios->wque);
+	if (unlikely(rc < 0)) {
+		pr_err("%s: [%s] init_wque failed (%d)\n",
+			__func__, mdname(mddev), rc);
+		mempool_destroy(ios->memp);
+		kmem_cache_destroy(kcp);
+		ios->memp = NULL;
+		free_percpu(ios->stat);
+		ios->stat = NULL;
+		return rc;
+	}
+	if (likely(mddev->kobj.sd != NULL) &&
+	    sysfs_create_group(&mddev->kobj, &md_raid1_ioser_group))
+		pr_warn("%s: [%s] cannot register extra attributes\n",
+			__func__, mdname(mddev));
+	return 0;
+}
+
+static void raid1_ioser_exit(struct r1ioser *ios)
+{
+	struct r1conf *conf = container_of(ios, struct r1conf, ioser);
+	struct mddev *mddev = conf->mddev;
+
+	struct r1chunk_addr *cap;
+	struct r1chunk_data *cdp;
+	struct kmem_cache *kcp;
+	struct rb_node *ndp;
+	struct bio *bio;
+
+	int i;
+
+	if (ios->bits == 0)
+		return;
+	ios->init = false;
+	__raid1_ioser_wait_drain(ios);
+	if (likely(mddev->kobj.sd != NULL))
+		sysfs_remove_group(&mddev->kobj, &md_raid1_ioser_group);
+	cancel_delayed_work_sync(&ios->gcol);
+	raid1_ioser_exit_wque(mddev, &ios->wque);
+	for (i = 0; i < ARRAY_SIZE(ios->addr); i++) {
+		cap = &ios->addr[i];
+		mutex_lock(&cap->lock);
+		ndp = rb_first(&cap->root);
+		while (ndp != NULL) {
+			cdp = rb_entry(ndp, struct r1chunk_data, node);
+			ndp = rb_next(ndp);
+			mutex_lock(&cdp->lock);
+			WARN(cdp->bio != NULL, "%s: [%s] IO in flight\n",
+						__func__, mdname(mddev));
+			bio = bio_list_pop(&cdp->list);
+			while (unlikely(bio != NULL)) {
+				bio_io_error(bio);
+				bio = bio_list_pop(&cdp->list);
+			}
+			mutex_unlock(&cdp->lock);
+			mutex_destroy(&cdp->lock);
+			rb_erase(&cdp->node, &cap->root);
+			mempool_free(cdp, ios->memp);
+		}
+		mutex_unlock(&cap->lock);
+		mutex_destroy(&cap->lock);
+	}
+	if (likely(ios->memp != NULL)) {
+		kcp = ios->memp->pool_data;
+		mempool_destroy(ios->memp);
+		kmem_cache_destroy(kcp);
+		ios->memp = NULL;
+	}
+	if (likely(ios->stat != NULL)) {
+		free_percpu(ios->stat);
+		ios->stat = NULL;
+	}
+}
+
+static int raid1_ioser_chunk_size_show(char *cp, const struct kernel_param *kp)
+{
+	return sprintf(cp, "%u\n", (raid1_ioser_chunk_bits > 0 ?
+			(1U << (raid1_ioser_chunk_bits - 1)) : 0));
+}
+
+static int raid1_ioser_chunk_size_store(const char *cp, const struct kernel_param *kp)
+{
+	size_t size;
+	int bits;
+
+	if (kstrtoul(cp, 0, &size) != 0)
+		return -EINVAL;
+
+	if (size == 0) {
+		if (raid1_ioser_chunk_bits != 0) {
+			pr_info("%s: IO serialization disabled\n", __func__);
+			raid1_ioser_chunk_bits = 0;
+		}
+		return 0;
+	}
+	if (raid1_ioser_chunk_bits == 0)
+		pr_info("%s: IO serialization enabled\n", __func__);
+
+	bits = order_base_2(size) + 1;
+	raid1_ioser_chunk_bits = clamp(bits,
+		RAID1_IOSER_MIN_CHUNK_BITS, RAID1_IOSER_MAX_CHUNK_BITS);
+	return 0;
+}
+
+static const struct kernel_param_ops raid1_ioser_chunk_size_ops = {
+	.set = raid1_ioser_chunk_size_store,
+	.get = raid1_ioser_chunk_size_show,
+};
+
+module_param_cb(ioser_chunk_size_kb,
+		&raid1_ioser_chunk_size_ops,
+		NULL, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(ioser_chunk_size_kb,
+		" IO serialization chunk size in KiB (zero disables it)");
+
+#endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1d54109071cc..8154a5ce216a 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -83,6 +83,10 @@ static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
 
 #include "raid1-10.c"
 
+#ifdef CONFIG_MD_RAID1_IOSER
+#include "raid1-ioser.c"
+#endif
+
 /*
  * for resync bio, r1bio pointer can be retrieved from the per-bio
  * 'struct resync_pages'.
@@ -1065,6 +1069,9 @@ static void freeze_array(struct r1conf *conf, int extra)
 	 * get_unqueued_pendings(conf) gets equal to extra. For
 	 * normal I/O context, extra is 1, in rested situations extra is 0.
 	 */
+#ifdef CONFIG_MD_RAID1_IOSER
+	raid1_ioser_stop_and_drain(&conf->ioser);
+#endif
 	spin_lock_irq(&conf->resync_lock);
 	conf->array_frozen = 1;
 	raid1_log(conf->mddev, "wait freeze");
@@ -1074,7 +1081,12 @@ static void freeze_array(struct r1conf *conf, int extra)
 		conf->resync_lock,
 		flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
+
+#ifdef CONFIG_MD_RAID1_IOSER
+	raid1_ioser_start(&conf->ioser);
+#endif
 }
+
 static void unfreeze_array(struct r1conf *conf)
 {
 	/* reverse the effect of the freeze */
@@ -1536,6 +1548,11 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
 	sector_t sectors;
 
 	if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
+#ifdef CONFIG_MD_RAID1_IOSER
+		struct r1conf *conf = mddev->private;
+
+		raid1_ioser_wait_drain(&conf->ioser);
+#endif
 		md_flush_request(mddev, bio);
 		return true;
 	}
@@ -1555,7 +1572,11 @@ static bool raid1_make_request(struct mddev *mddev, struct bio *bio)
 	else {
 		if (!md_write_start(mddev,bio))
 			return false;
+#ifdef CONFIG_MD_RAID1_IOSER
+		raid1_ioser_write_request(mddev, bio, sectors);
+#else
 		raid1_write_request(mddev, bio, sectors);
+#endif
 	}
 	return true;
 }
@@ -3023,10 +3044,19 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	if (!conf->thread)
 		goto abort;
 
+#ifdef CONFIG_MD_RAID1_IOSER
+	err = raid1_ioser_init(&conf->ioser);
+	if (unlikely(err < 0))
+		goto abort;
+#endif
+
 	return conf;
 
  abort:
 	if (conf) {
+#ifdef CONFIG_MD_RAID1_IOSER
+		raid1_ioser_exit(&conf->ioser);
+#endif
 		mempool_exit(&conf->r1bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
@@ -3123,6 +3153,9 @@ static int raid1_run(struct mddev *mddev)
 		else
 			blk_queue_flag_clear(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
+#ifdef CONFIG_MD_RAID1_IOSER
+		raid1_ioser_set_queue_limits(&conf->ioser);
+#endif
 	}
 
 	ret =  md_integrity_register(mddev);
@@ -3136,6 +3169,9 @@ static int raid1_run(struct mddev *mddev)
 static void raid1_free(struct mddev *mddev, void *priv)
 {
 	struct r1conf *conf = priv;
+#ifdef CONFIG_MD_RAID1_IOSER
+	raid1_ioser_exit(&conf->ioser);
+#endif
 
 	mempool_exit(&conf->r1bio_pool);
 	kfree(conf->mirrors);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index e7ccad898736..2253561a40ac 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -65,6 +65,85 @@ struct pool_info {
 	int	raid_disks;
 };
 
+/* Needed when MD_RAID1_IOSER is built as a module */
+#ifdef CONFIG_MD_RAID1_IOSER_MODULE
+#define CONFIG_MD_RAID1_IOSER
+#endif
+
+#ifdef CONFIG_MD_RAID1_IOSER
+
+struct r1wque {
+	struct task_struct	*task;
+	struct llist_head	list;
+	wait_queue_head_t	wait;
+};
+
+struct r1ioser;
+
+struct r1chunk_data {
+	struct r1ioser	*ios;
+	struct rb_node	node;
+	struct bio	*bio;
+	struct bio_list	list;
+	sector_t	sect;
+	unsigned long	time;
+	struct mutex	lock;
+	atomic_t	rcnt;
+	bool		wrnd;
+};
+
+struct r1chunk_addr {
+	struct rb_root	root;
+	struct mutex	lock;
+};
+
+struct r1ioser_stat {
+	unsigned long	ovct;
+};
+
+#define RAID1_IOSER_ADDR_BITS	(7)
+#define RAID1_IOSER_ADDR_SIZE	(1U << RAID1_IOSER_ADDR_BITS)
+
+struct r1ioser {
+	struct r1chunk_addr	addr[RAID1_IOSER_ADDR_SIZE];
+	mempool_t		*memp;
+	struct r1wque 		wque;
+	struct delayed_work	gcol;
+	struct r1ioser_stat __percpu
+				*stat;
+	wait_queue_head_t	barr;	/* for stop */
+	wait_queue_head_t	wait;	/* for ioct */
+	atomic_t		ioct;
+	atomic_t		stop;
+	bool			init;
+	unsigned char		bits;
+};
+
+#define per_cpu_get(p, f)	(		\
+{						\
+	unsigned int __c;			\
+	typeof((p)->f) __r = 0;			\
+	for_each_possible_cpu(__c)		\
+		__r += per_cpu_ptr(p, __c)->f;	\
+	__r;					\
+}				)
+
+#define per_cpu_inc(p, f)			\
+do {						\
+	unsigned int __c = get_cpu();		\
+	per_cpu_ptr(p, __c)->f++;		\
+	put_cpu();				\
+} while (0)
+
+#define per_cpu_set(p, f, v)			\
+do {						\
+	unsigned int __c;			\
+	for_each_possible_cpu(__c)		\
+		per_cpu_ptr(p, __c)->f = v;	\
+} while (0)
+
+#endif
+
 struct r1conf {
 	struct mddev		*mddev;
 	struct raid1_info	*mirrors;	/* twice 'raid_disks' to
@@ -139,6 +218,9 @@ struct r1conf {
 	sector_t		cluster_sync_low;
 	sector_t		cluster_sync_high;
 
+#ifdef CONFIG_MD_RAID1_IOSER
+	struct r1ioser		ioser;
+#endif
 };
 
 /*
-- 
2.7.4



