From: Florian-Ewald Mueller <florian-ewald.mueller@xxxxxxxxxxxxxxxx> This introduces the writable module parameter 'rq_mode', which is used to set the I/O mode for all subsequently created MD devices. Set it to 0 for the default mode (the make request function mode) in order to process I/O bio-by-bio, or set it to 1 for the new request function mode in order to process I/O request-by-request. Common code is shared between both modes. The advantage of the new mode is that an I/O scheduler can be used and the block layer takes care of the I/O statistics. Signed-off-by: Florian-Ewald Mueller <florian-ewald.mueller@xxxxxxxxxxxxxxxx> [spars: merged commits, changed description, fixed checkpatch warnings] Signed-off-by: Sebastian Parschauer <sebastian.riemer@xxxxxxxxxxxxxxxx> --- drivers/md/md.c | 280 +++++++++++++++++++++++++++++++++++++++++++++++++------ drivers/md/md.h | 7 ++ 2 files changed, 257 insertions(+), 30 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 8c653f9..0e5c420 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -56,8 +56,6 @@ #ifdef BIO_ACCOUNTING_EXTENSION -#include <linux/ratelimit.h> - struct md_bio_private { void (*orig_bio_endio)(struct bio *, int); void *orig_bio_private; @@ -68,6 +66,30 @@ struct md_bio_private { static struct kmem_cache *md_bio_private_cache __read_mostly; +#endif /* BIO_ACCOUNTING_EXTENSION */ + +#ifdef MD_REQUEST_FUNCTION + +struct md_request_clone { + struct work_struct work; + struct mddev *mdp; + struct request *req; + struct bio_list bios; + atomic_t cnt; + int err; +}; + +#define MD_RQ_MODE_DEFAULT 0 + +static unsigned int rq_mode __read_mostly = MD_RQ_MODE_DEFAULT; +static struct kmem_cache *md_request_clone_cache __read_mostly; + +#endif /* MD_REQUEST_FUNCTION */ + +#if defined BIO_ACCOUNTING_EXTENSION || defined MD_REQUEST_FUNCTION + +#include <linux/ratelimit.h> + static DEFINE_RATELIMIT_STATE(md_ratelimit_state, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -78,7 +100,7 @@ static inline int __must_check 
md_valid_ptr(const void *p) } #define VALID_PTR(p) md_valid_ptr(p) -#endif /* BIO_ACCOUNTING_EXTENSION */ +#endif /* BIO_ACCOUNTING_EXTENSION || MD_REQUEST_FUNCTION */ #ifndef MODULE static void autostart_arrays(int part); @@ -326,31 +348,17 @@ static void md_bio_endio(struct bio *bio, int err) #endif /* BIO_ACCOUNTING_EXTENSION */ -/* Rather than calling directly into the personality make_request function, - * IO requests come here first so that we can check if the device is - * being suspended pending a reconfiguration. - * We hold a refcount over the call to ->make_request. By the time that - * call has finished, the bio has been linked into some internal structure - * and so is visible to ->quiesce(), so we don't need the refcount any more. - */ -static void md_make_request(struct request_queue *q, struct bio *bio) +static inline int md_make_request_head(struct mddev *mddev, struct bio *bio) { const int rw = bio_data_dir(bio); - struct mddev *mddev = q->queuedata; - int cpu; - unsigned int sectors; -#ifdef BIO_ACCOUNTING_EXTENSION - struct md_bio_private *mbp; -#endif /* BIO_ACCOUNTING_EXTENSION */ - if (mddev == NULL || mddev->pers == NULL - || !mddev->ready) { + if (mddev == NULL || mddev->pers == NULL || !mddev->ready) { bio_io_error(bio); - return; + return 1; } if (mddev->ro == 1 && unlikely(rw == WRITE)) { bio_endio(bio, bio_sectors(bio) == 0 ? 
0 : -EROFS); - return; + return 1; } smp_rmb(); /* Ensure implications of 'active' are visible */ rcu_read_lock(); @@ -369,6 +377,39 @@ static void md_make_request(struct request_queue *q, struct bio *bio) } atomic_inc(&mddev->active_io); rcu_read_unlock(); + return 0; +} + +static inline void md_make_request_body(struct mddev *mddev, struct bio *bio) +{ + mddev->pers->make_request(mddev, bio); +} + +static inline void md_make_request_tail(struct mddev *mddev, struct bio *bio) +{ + if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) + wake_up(&mddev->sb_wait); +} + +/* Rather than calling directly into the personality make_request function, + * IO requests come here first so that we can check if the device is + * being suspended pending a reconfiguration. + * We hold a refcount over the call to ->make_request. By the time that + * call has finished, the bio has been linked into some internal structure + * and so is visible to ->quiesce(), so we don't need the refcount any more. 
+ */ +static void md_make_request(struct request_queue *q, struct bio *bio) +{ + const int rw = bio_data_dir(bio); + struct mddev *mddev = q->queuedata; + int cpu; + unsigned int sectors; +#ifdef BIO_ACCOUNTING_EXTENSION + struct md_bio_private *mbp; +#endif /* BIO_ACCOUNTING_EXTENSION */ + + if (unlikely(md_make_request_head(mddev, bio))) + return; /* * save the sectors now since our bio can @@ -397,7 +438,7 @@ static void md_make_request(struct request_queue *q, struct bio *bio) bio->bi_private = mbp; } #endif /* BIO_ACCOUNTING_EXTENSION */ - mddev->pers->make_request(mddev, bio); + md_make_request_body(mddev, bio); #ifndef BIO_ACCOUNTING_EXTENSION cpu = part_stat_lock(); @@ -406,10 +447,131 @@ static void md_make_request(struct request_queue *q, struct bio *bio) part_stat_unlock(); #endif /* !BIO_ACCOUNTING_EXTENSION */ - if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) - wake_up(&mddev->sb_wait); + md_make_request_tail(mddev, bio); +} + +#ifdef MD_REQUEST_FUNCTION + +static inline void md_make_request_bio(struct mddev *mddev, struct bio *bio) +{ + if (unlikely(md_make_request_head(mddev, bio))) + return; + md_make_request_body(mddev, bio); + md_make_request_tail(mddev, bio); +} + +static inline void md_request_clone_release(struct md_request_clone *rcl) +{ + if (atomic_dec_and_test(&rcl->cnt)) { + blk_end_request_all(rcl->req, rcl->err); + kmem_cache_free(md_request_clone_cache, rcl); + } +} + +static void md_request_bio_endio(struct bio *bio, int err) +{ + struct md_request_clone *rcl = bio->bi_private; + + if (unlikely(err < 0)) + rcl->err = err; + + bio_put(bio); + md_request_clone_release(rcl); +} + +static void md_request_clone_worker(struct work_struct *wkp) +{ + struct md_request_clone *rcl = + container_of(wkp, struct md_request_clone, work); + struct bio_list *blp = &rcl->bios; + struct mddev *mddev = rcl->mdp; + struct bio *bio; + + bio = bio_list_pop(blp); + while (VALID_PTR(bio)) { + md_make_request_bio(mddev, bio); + bio = 
bio_list_pop(blp); + } + md_request_clone_release(rcl); } +static inline int md_process_request(struct mddev *mddev, struct request *req) +{ + struct md_request_clone *rcl; + + struct bio *bio, *clone; + int error; + + rcl = kmem_cache_alloc(md_request_clone_cache, GFP_NOIO); + if (unlikely(!VALID_PTR(rcl))) { + if (__ratelimit(&md_ratelimit_state)) + pr_warn("%s: [%s] kmem_cache_alloc failed\n", + __func__, mdname(mddev)); + return -ENOMEM; + } + rcl->err = 0; + rcl->req = req; + rcl->mdp = mddev; + atomic_set(&rcl->cnt, 1); + bio_list_init(&rcl->bios); + bio = req->bio; + while (VALID_PTR(bio)) { + clone = bio_clone(bio, GFP_NOWAIT); + if (unlikely(!VALID_PTR(clone))) { + if (__ratelimit(&md_ratelimit_state)) + pr_warn("%s: [%s] bio_clone failed\n", + __func__, mdname(mddev)); + error = -ENOMEM; + goto error_out; + } + clone->bi_private = rcl; + clone->bi_end_io = md_request_bio_endio; + bio_list_add(&rcl->bios, clone); + atomic_inc(&rcl->cnt); + bio = bio->bi_next; + } + INIT_WORK(&rcl->work, md_request_clone_worker); + queue_work(mddev->request_wq, &rcl->work); + return 0; +error_out: + bio = bio_list_pop(&rcl->bios); + while (VALID_PTR(bio)) { + bio_put(bio); + bio = bio_list_pop(&rcl->bios); + } + kmem_cache_free(md_request_clone_cache, rcl); + return error; +} + +#ifndef blk_fs_request +#define blk_fs_request(p) ((p)->cmd_type == REQ_TYPE_FS) +#endif /* !blk_fs_request */ + +static void md_request_function(struct request_queue *rqp) +{ + struct mddev *mddev = rqp->queuedata; + + struct request *req; + int rc; + + while ((req = blk_fetch_request(rqp)) != NULL) { + if (unlikely(!blk_fs_request(req))) { + if (__ratelimit(&md_ratelimit_state)) + pr_warn("%s: [%s] non-fs request\n", + __func__, mdname(mddev)); + __blk_end_request_all(req, -ENOTSUPP); + continue; + } + spin_unlock_irq(rqp->queue_lock); + rc = md_process_request(mddev, req); + spin_lock_irq(rqp->queue_lock); + if (unlikely(rc < 0)) + __blk_end_request_all(req, rc); + } +} + +#endif /* 
MD_REQUEST_FUNCTION */ + /* mddev_suspend makes sure no new requests are submitted * to the device, and that any requests that have been submitted * are completely handled. @@ -567,8 +729,15 @@ static void mddev_put(struct mddev *mddev) */ INIT_WORK(&mddev->del_work, mddev_delayed_delete); queue_work(md_misc_wq, &mddev->del_work); - } else + } else { +#ifdef MD_REQUEST_FUNCTION + if (likely(VALID_PTR(mddev->request_wq))) { + destroy_workqueue(mddev->request_wq); + mddev->request_wq = NULL; + } +#endif /* MD_REQUEST_FUNCTION */ kfree(mddev); + } } spin_unlock(&all_mddevs_lock); if (bs) @@ -4923,6 +5092,13 @@ static void md_free(struct kobject *ko) if (mddev->queue) blk_cleanup_queue(mddev->queue); +#ifdef MD_REQUEST_FUNCTION + if (likely(VALID_PTR(mddev->request_wq))) { + destroy_workqueue(mddev->request_wq); + mddev->request_wq = NULL; + } +#endif /* MD_REQUEST_FUNCTION */ + kfree(mddev); } @@ -4990,12 +5166,32 @@ static int md_alloc(dev_t dev, char *name) } error = -ENOMEM; - mddev->queue = blk_alloc_queue(GFP_KERNEL); - if (!mddev->queue) - goto abort; +#ifdef MD_REQUEST_FUNCTION + if (!rq_mode) { +#endif /* MD_REQUEST_FUNCTION */ + mddev->queue = blk_alloc_queue(GFP_KERNEL); + if (!mddev->queue) + goto abort; + blk_queue_make_request(mddev->queue, md_make_request); +#ifdef MD_REQUEST_FUNCTION + } else { + mddev->request_wq = + create_singlethread_workqueue(mdname(mddev)); + if (unlikely(!VALID_PTR(mddev->request_wq))) { + pr_warn("%s: create_singlethread_workqueue (%s) " + "failed\n", __func__, mdname(mddev)); + goto abort; + } + mddev->queue = blk_init_queue(md_request_function, NULL); + if (!mddev->queue) { + destroy_workqueue(mddev->request_wq); + mddev->request_wq = NULL; + goto abort; + } + } +#endif /* MD_REQUEST_FUNCTION */ mddev->queue->queuedata = mddev; - blk_queue_make_request(mddev->queue, md_make_request); blk_set_stacking_limits(&mddev->queue->limits); disk = alloc_disk(1 << shift); @@ -8714,11 +8910,23 @@ static int __init md_init(void) #ifdef 
BIO_ACCOUNTING_EXTENSION md_bio_private_cache = KMEM_CACHE(md_bio_private, 0); if (unlikely(!VALID_PTR(md_bio_private_cache))) { - pr_err("%s: KMEM_CACHE failed\n", __func__); + pr_err("%s: KMEM_CACHE (bio_priv) failed\n", __func__); return -ENOMEM; } #endif /* BIO_ACCOUNTING_EXTENSION */ +#ifdef MD_REQUEST_FUNCTION + md_request_clone_cache = KMEM_CACHE(md_request_clone, 0); + if (unlikely(!VALID_PTR(md_request_clone_cache))) { + pr_err("%s: KMEM_CACHE (req_clone) failed\n", __func__); +#ifdef BIO_ACCOUNTING_EXTENSION + kmem_cache_destroy(md_bio_private_cache); + md_bio_private_cache = NULL; +#endif /* BIO_ACCOUNTING_EXTENSION */ + return -ENOMEM; + } +#endif /* MD_REQUEST_FUNCTION */ + md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); if (!md_wq) goto err_wq; @@ -8856,6 +9064,13 @@ static __exit void md_exit(void) destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); +#ifdef MD_REQUEST_FUNCTION + if (likely(VALID_PTR(md_request_clone_cache))) { + kmem_cache_destroy(md_request_clone_cache); + md_request_clone_cache = NULL; + } +#endif /* MD_REQUEST_FUNCTION */ + #ifdef BIO_ACCOUNTING_EXTENSION if (likely(VALID_PTR(md_bio_private_cache))) { kmem_cache_destroy(md_bio_private_cache); @@ -8887,6 +9102,11 @@ module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); +#ifdef MD_REQUEST_FUNCTION +module_param(rq_mode, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(rq_mode, " this module's io input mode (default: 0 [make request mode])"); +#endif /* MD_REQUEST_FUNCTION */ + EXPORT_SYMBOL(register_md_personality); EXPORT_SYMBOL(unregister_md_personality); EXPORT_SYMBOL(md_error); diff --git a/drivers/md/md.h b/drivers/md/md.h index f0e9171..8d639e0 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -25,6 +25,10 @@ #include <linux/workqueue.h> #if 1 +#define MD_REQUEST_FUNCTION +#endif + +#if 1 #define BIO_ACCOUNTING_EXTENSION #endif @@ -455,6 +459,9 @@ struct mddev { #ifdef BIO_ACCOUNTING_EXTENSION struct 
md_stats stats; #endif /* BIO_ACCOUNTING_EXTENSION */ +#ifdef MD_REQUEST_FUNCTION + struct workqueue_struct *request_wq; +#endif /* MD_REQUEST_FUNCTION */ }; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html