Hi all,
this is a proposal for a different implementation of request
callbacks. The existing ->endio callback of a request is actually a
destructor function, to be called to terminate a request and free all
structures.
However, on certain occasions (like request-based multipathing) it is
desirable to have a callback function for a request which is called
right after the request is finished, ie in end_that_request_first()
before any bio->bi_endio callback is called.
So a simple solution for this is to clone the request and add a new
'marker' bio in front of the bio list of the request. This callback
will be attached a structure in bi_private which keeps a pointer to
the cloned and the original request, thus serving as a callback for
the request itself.
Proposed patch attached. As usual comments are welcome.
Cheers,
Hannes
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index a15845c..c286f05 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -40,7 +40,6 @@ static void blk_unplug_work(struct work_struct *work);
static void blk_unplug_timeout(unsigned long data);
static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
static void init_request_from_bio(struct request *req, struct bio *bio);
-static int __make_request(struct request_queue *q, struct bio *bio);
static struct io_context *current_io_context(gfp_t gfp_flags, int node);
/*
@@ -2912,7 +2911,7 @@ static void init_request_from_bio(struct request *req, struct bio *bio)
req->start_time = jiffies;
}
-static int __make_request(struct request_queue *q, struct bio *bio)
+int __make_request(struct request_queue *q, struct bio *bio)
{
struct request *req;
int el_ret, nr_sectors, barrier, err;
@@ -3030,6 +3029,7 @@ end_io:
bio_endio(bio, nr_sectors << 9, err);
return 0;
}
+EXPORT_SYMBOL(__make_request);
/*
* If bio->bi_dev is a partition, remap the location
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d6ca9d0..76acda2 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1,6 +1,7 @@
/*
* Copyright (C) 2003 Sistina Software Limited.
* Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved.
+ * Copyright (C) 2007 NEC Corporation.
*
* This file is released under the GPL.
*/
@@ -8,8 +9,7 @@
#include "dm.h"
#include "dm-path-selector.h"
#include "dm-hw-handler.h"
-#include "dm-bio-list.h"
-#include "dm-bio-record.h"
+#include "dm-rq-record.h"
#include <linux/ctype.h>
#include <linux/init.h>
@@ -77,7 +77,7 @@ struct multipath {
unsigned saved_queue_if_no_path;/* Saved state during suspension */
struct work_struct process_queued_ios;
- struct bio_list queued_ios;
+ struct list_head queued_ios;
unsigned queue_size;
struct work_struct trigger_event;
@@ -94,7 +94,7 @@ struct multipath {
*/
struct dm_mpath_io {
struct pgpath *pgpath;
- struct dm_bio_details details;
+ struct dm_rq_details details;
};
typedef int (*action_fn) (struct pgpath *pgpath);
@@ -171,6 +171,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
m = kzalloc(sizeof(*m), GFP_KERNEL);
if (m) {
INIT_LIST_HEAD(&m->priority_groups);
+ INIT_LIST_HEAD(&m->queued_ios);
spin_lock_init(&m->lock);
m->queue_io = 1;
INIT_WORK(&m->process_queued_ios, process_queued_ios);
@@ -180,6 +181,9 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
kfree(m);
return NULL;
}
+
+ dm_table_enable_elevator(ti->table);
+
m->ti = ti;
ti->private = m;
}
@@ -202,6 +206,8 @@ static void free_multipath(struct multipath *m)
dm_put_hw_handler(hwh->type);
}
+ dm_table_disable_elevator(m->ti->table);
+
mempool_destroy(m->mpio_pool);
kfree(m);
}
@@ -299,7 +305,7 @@ static int __must_push_back(struct multipath *m)
dm_noflush_suspending(m->ti));
}
-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
struct dm_mpath_io *mpio, unsigned was_queued)
{
int r = DM_MAPIO_REMAPPED;
@@ -321,22 +327,30 @@ static int map_io(struct multipath *m, struct bio *bio,
if ((pgpath && m->queue_io) ||
(!pgpath && m->queue_if_no_path)) {
/* Queue for the daemon to resubmit */
- bio_list_add(&m->queued_ios, bio);
+ list_add_tail(&clone->queuelist, &m->queued_ios);
m->queue_size++;
if ((m->pg_init_required && !m->pg_init_in_progress) ||
!m->queue_io)
queue_work(kmultipathd, &m->process_queued_ios);
pgpath = NULL;
+ clone->q = NULL;
+ clone->rq_disk = NULL;
r = DM_MAPIO_SUBMITTED;
- } else if (pgpath)
- bio->bi_bdev = pgpath->path.dev->bdev;
- else if (__must_push_back(m))
- r = DM_MAPIO_REQUEUE;
- else
+ } else if (pgpath) {
+ clone->q = bdev_get_queue(pgpath->path.dev->bdev);
+ clone->rq_disk = pgpath->path.dev->bdev->bd_disk;
+ } else {
+ printk(KERN_INFO "request %p: failed to map\n", clone);
+ clone->q = NULL;
+ clone->rq_disk = NULL;
r = -EIO; /* Failed */
+ }
mpio->pgpath = pgpath;
+ if (r == DM_MAPIO_REMAPPED && m->current_pg->ps.type->start_io)
+ m->current_pg->ps.type->start_io(&m->current_pg->ps,
+ &pgpath->path, clone);
spin_unlock_irqrestore(&m->lock, flags);
return r;
@@ -373,30 +387,38 @@ static void dispatch_queued_ios(struct multipath *m)
{
int r;
unsigned long flags;
- struct bio *bio = NULL, *next;
struct dm_mpath_io *mpio;
union map_info *info;
+ struct request *clone, *n;
+ LIST_HEAD(cl);
spin_lock_irqsave(&m->lock, flags);
- bio = bio_list_get(&m->queued_ios);
+ list_splice_init(&m->queued_ios, &cl);
spin_unlock_irqrestore(&m->lock, flags);
- while (bio) {
- next = bio->bi_next;
- bio->bi_next = NULL;
+ list_for_each_entry_safe(clone, n, &cl, queuelist) {
+ struct bio *bio = clone->bio;
+ list_del(&clone->queuelist);
+
+ BUG_ON(!bio);
- info = dm_get_mapinfo(bio);
+ info = dm_get_mapinfo(clone->bio);
mpio = info->ptr;
- r = map_io(m, bio, mpio, 1);
- if (r < 0)
+ r = map_io(m, clone, mpio, 1);
+ if (r < 0) {
+ printk(KERN_INFO "%s: map clone %p failed\n",
+ __FUNCTION__, clone);
+ /* FIXME */
bio_endio(bio, bio->bi_size, r);
- else if (r == DM_MAPIO_REMAPPED)
- generic_make_request(bio);
- else if (r == DM_MAPIO_REQUEUE)
+ } else if (r == DM_MAPIO_REMAPPED)
+ dm_dispatch_request(clone->q, clone);
+ else if (r == DM_MAPIO_REQUEUE) {
+ printk(KERN_INFO "%s: requeue map clone %p\n",
+ __FUNCTION__, clone);
+ /* FIXME */
bio_endio(bio, bio->bi_size, -EIO);
-
- bio = next;
+ }
}
}
@@ -789,23 +811,29 @@ static void multipath_dtr(struct dm_target *ti)
}
/*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
*/
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
union map_info *map_context)
{
int r;
struct dm_mpath_io *mpio;
struct multipath *m = (struct multipath *) ti->private;
+ if (blk_barrier_rq(clone))
+ return -EOPNOTSUPP;
+
mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
- dm_bio_record(&mpio->details, bio);
+ memset(mpio, 0, sizeof(mpio));
+ clone->cmd_flags |= REQ_FAILFAST;
+ dm_rq_record(&mpio->details, clone);
map_context->ptr = mpio;
- bio->bi_rw |= (1 << BIO_RW_FAILFAST);
- r = map_io(m, bio, mpio, 0);
- if (r < 0 || r == DM_MAPIO_REQUEUE)
+ r = map_io(m, clone, mpio, 0);
+ if (r < 0 || r == DM_MAPIO_REQUEUE) {
mempool_free(mpio, m->mpio_pool);
+ map_context->ptr = NULL;
+ }
return r;
}
@@ -1007,25 +1035,37 @@ void dm_pg_init_complete(struct dm_path *path, unsigned err_flags)
spin_unlock_irqrestore(&m->lock, flags);
}
+static void multipath_queue_in_tgt(struct dm_target *ti, struct request *clone,
+ union map_info *map_context)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+ struct dm_mpath_io *mpio = (struct dm_mpath_io *) map_context->ptr;
+ unsigned long flags = 0UL;
+
+ dm_rq_restore(&mpio->details, clone);
+
+ /* queue for the daemon to resubmit or fail */
+ spin_lock_irqsave(&m->lock, flags);
+ list_add_tail(&clone->queuelist, &m->queued_ios);
+ m->queue_size++;
+ if (!m->queue_io)
+ queue_work(kmultipathd, &m->process_queued_ios);
+ spin_unlock_irqrestore(&m->lock, flags);
+}
+
/*
* end_io handling
*/
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
int error, struct dm_mpath_io *mpio)
{
struct hw_handler *hwh = &m->hw_handler;
unsigned err_flags = MP_FAIL_PATH; /* Default behavior */
- unsigned long flags;
+ unsigned long flags = 0UL;
- if (!error)
+ if (!clone->errors && !error)
return 0; /* I/O complete */
- if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
- return error;
-
- if (error == -EOPNOTSUPP)
- return error;
-
spin_lock_irqsave(&m->lock, flags);
if (!m->nr_valid_paths) {
if (__must_push_back(m)) {
@@ -1041,9 +1081,6 @@ static int do_end_io(struct multipath *m, struct bio *bio,
}
spin_unlock_irqrestore(&m->lock, flags);
- if (hwh->type && hwh->type->error)
- err_flags = hwh->type->error(hwh, bio);
-
if (mpio->pgpath) {
if (err_flags & MP_FAIL_PATH)
fail_path(mpio->pgpath);
@@ -1056,21 +1093,14 @@ static int do_end_io(struct multipath *m, struct bio *bio,
return -EIO;
requeue:
- dm_bio_restore(&mpio->details, bio);
-
- /* queue for the daemon to resubmit or fail */
- spin_lock_irqsave(&m->lock, flags);
- bio_list_add(&m->queued_ios, bio);
- m->queue_size++;
- if (!m->queue_io)
- queue_work(kmultipathd, &m->process_queued_ios);
- spin_unlock_irqrestore(&m->lock, flags);
-
return DM_ENDIO_INCOMPLETE; /* io not complete */
}
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
- int error, union map_info *map_context)
+/*
+ * clone->q's lock must be held
+ */
+static int multipath_end_io_first(struct dm_target *ti, struct request *clone,
+ int error, union map_info *map_context)
{
struct multipath *m = ti->private;
struct dm_mpath_io *mpio = map_context->ptr;
@@ -1078,18 +1108,34 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
struct path_selector *ps;
int r;
- r = do_end_io(m, bio, error, mpio);
+ r = do_end_io(m, clone, error, mpio);
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path);
+ ps->type->end_io(ps, &pgpath->path, clone);
}
- if (r != DM_ENDIO_INCOMPLETE)
+ if (r != DM_ENDIO_INCOMPLETE && mpio) {
mempool_free(mpio, m->mpio_pool);
+ map_context->ptr = NULL;
+ }
return r;
}
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
+ int error, union map_info *map_context)
+{
+ struct multipath *m = (struct multipath *) ti->private;
+ struct dm_mpath_io *mpio = (struct dm_mpath_io *) map_context->ptr;
+
+ if (mpio) {
+ mempool_free(mpio, m->mpio_pool);
+ map_context->ptr = NULL;
+ }
+
+ return 0;
+}
+
/*
* Suspend can't complete until all the I/O is processed so if
* the last path fails we must error any remaining I/O.
@@ -1329,8 +1375,10 @@ static struct target_type multipath_target = {
.module = THIS_MODULE,
.ctr = multipath_ctr,
.dtr = multipath_dtr,
- .map = multipath_map,
- .end_io = multipath_end_io,
+ .map_rq = multipath_map,
+ .rq_end_io_first = multipath_end_io_first,
+ .rq_end_io = multipath_end_io,
+ .queue_in_tgt = multipath_queue_in_tgt,
.presuspend = multipath_presuspend,
.resume = multipath_resume,
.status = multipath_status,
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b8..089a979 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -75,7 +75,10 @@ struct path_selector_type {
int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
- int (*end_io) (struct path_selector *ps, struct dm_path *path);
+ int (*start_io) (struct path_selector *ps, struct dm_path *path,
+ struct request *clone);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path,
+ struct request *clone);
};
/* Register a path selector */
diff --git a/drivers/md/dm-rq-record.h b/drivers/md/dm-rq-record.h
new file mode 100644
index 0000000..4e09598
--- /dev/null
+++ b/drivers/md/dm-rq-record.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007 NEC Corporation.
+ *
+ * This file is released under the GPL.
+ */
+
+#ifndef DM_RQ_RECORD_H
+#define DM_RQ_RECORD_H
+
+#include <linux/blkdev.h>
+
+/*
+ * There are lots of mutable fields in the request struct that get
+ * changed by the lower levels of the block layer. Some targets,
+ * such as multipath, may wish to resubmit a request on error. The
+ * functions in this file help the target record and restore the
+ * original request state.
+ */
+struct dm_rq_details {
+ unsigned int cmd_flags;
+ int ref_count;
+};
+
+static inline void dm_rq_record(struct dm_rq_details *rd, struct request *rq)
+{
+ rd->cmd_flags = rq->cmd_flags;
+ rd->ref_count = rq->ref_count;
+}
+
+static inline void dm_rq_restore(struct dm_rq_details *rd, struct request *rq)
+{
+ rq->cmd_flags = rd->cmd_flags;
+ rq->ref_count = rd->ref_count;
+}
+
+#endif
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 2bcde57..ad0c85e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1033,6 +1033,28 @@ struct mapped_device *dm_table_get_md(struct dm_table *t)
return t->md;
}
+int dm_table_enable_elevator(struct dm_table *t)
+{
+ struct mapped_device *md = dm_table_get_md(t);
+
+ if (!md)
+ return 1;
+
+ if (dm_enable_elevator(md)) {
+ dm_put(md);
+ return 1;
+ }
+ dm_put(md);
+
+ return 0;
+}
+
+void dm_table_disable_elevator(struct dm_table *t)
+{
+ /* No reference taken */
+ dm_disable_elevator(t->md);
+}
+
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
@@ -1044,3 +1066,5 @@ EXPORT_SYMBOL(dm_table_put);
EXPORT_SYMBOL(dm_table_get);
EXPORT_SYMBOL(dm_table_unplug_all);
EXPORT_SYMBOL(dm_table_flush_all);
+EXPORT_SYMBOL(dm_table_enable_elevator);
+EXPORT_SYMBOL(dm_table_disable_elevator);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 2120155..7ac9527 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,8 @@
#define DM_MSG_PREFIX "core"
+#define DM_DEBUG
+
static const char *_name = DM_NAME;
static unsigned int major = 0;
@@ -31,13 +33,19 @@ static unsigned int _major = 0;
static DEFINE_SPINLOCK(_minor_lock);
/*
- * One of these is allocated per bio.
+ * One of these is allocated per bio / request.
*/
struct dm_io {
struct mapped_device *md;
int error;
- struct bio *bio;
- atomic_t io_count;
+ union {
+ struct bio *bio;
+ struct request *req;
+ } data;
+ union {
+ atomic_t io_count;
+ struct request *orig;
+ } parm;
unsigned long start_time;
};
@@ -48,6 +56,7 @@ struct dm_io {
struct dm_target_io {
struct dm_io *io;
struct dm_target *ti;
+ struct dm_table *map;
union map_info info;
};
@@ -103,6 +112,7 @@ struct mapped_device {
* io objects are allocated from here.
*/
mempool_t *io_pool;
+ mempool_t *rq_pool;
mempool_t *tio_pool;
struct bio_set *bs;
@@ -125,6 +135,7 @@ struct mapped_device {
#define MIN_IOS 256
static struct kmem_cache *_io_cache;
+static struct kmem_cache *_rq_cache;
static struct kmem_cache *_tio_cache;
static int __init local_init(void)
@@ -143,6 +154,14 @@ static int __init local_init(void)
return -ENOMEM;
}
+ _rq_cache = kmem_cache_create("dm_rq", sizeof(struct request),
+ 0, 0, NULL);
+ if (!_rq_cache) {
+ kmem_cache_destroy(_tio_cache);
+ kmem_cache_destroy(_io_cache);
+ return -ENOMEM;
+ }
+
_major = major;
r = register_blkdev(_major, _name);
if (r < 0) {
@@ -160,6 +179,7 @@ static int __init local_init(void)
static void local_exit(void)
{
kmem_cache_destroy(_tio_cache);
+ kmem_cache_destroy(_rq_cache);
kmem_cache_destroy(_io_cache);
unregister_blkdev(_major, _name);
@@ -341,6 +361,16 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
mempool_free(tio, md->tio_pool);
}
+static inline struct request *alloc_rq(struct mapped_device *md)
+{
+ return mempool_alloc(md->rq_pool, GFP_NOIO);
+}
+
+static inline void free_rq(struct mapped_device *md, struct request *rq)
+{
+ mempool_free(rq, md->rq_pool);
+}
+
static void start_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
@@ -356,7 +386,7 @@ static void start_io_acct(struct dm_io *io)
static int end_io_acct(struct dm_io *io)
{
struct mapped_device *md = io->md;
- struct bio *bio = io->bio;
+ struct bio *bio = io->data.bio;
unsigned long duration = jiffies - io->start_time;
int pending;
int rw = bio_data_dir(bio);
@@ -434,6 +464,94 @@ int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
return 0;
}
+/*
+ * The queue is only valid as long as you have a reference
+ * count on 'md'.
+ */
+struct request_queue *dm_get_queue(struct mapped_device *md)
+{
+ if (blk_get_queue(md->queue))
+ return NULL;
+
+ return md->queue;
+}
+EXPORT_SYMBOL_GPL(dm_get_queue);
+
+void dm_put_queue(struct request_queue *q)
+{
+ blk_put_queue(q);
+}
+EXPORT_SYMBOL_GPL(dm_put_queue);
+
+void dm_stop_queue(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ blk_stop_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_stop_queue);
+
+void dm_start_queue(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (blk_queue_stopped(q))
+ blk_start_queue(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_start_queue);
+
+void __dm_requeue_request(struct request_queue *q, struct request *rq)
+{
+ if (elv_queue_empty(q))
+ blk_plug_device(q);
+ blk_requeue_request(q, rq);
+}
+EXPORT_SYMBOL_GPL(__dm_requeue_request);
+
+void dm_requeue_request(struct request_queue *q, struct request *rq)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+ __dm_requeue_request(q, rq);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_request);
+
+void dm_dispatch_request(struct request_queue *q, struct request *rq)
+{
+ int where = ELEVATOR_INSERT_BACK;
+ unsigned long flags;
+
+ spin_lock_irqsave(q->queue_lock, flags);
+
+ /* Initialize statistic stuff */
+ disk_round_stats(rq->rq_disk);
+ rq->start_time = jiffies;
+ rq->rq_disk->in_flight++;
+
+ __elv_add_request(q, rq, where, 0);
+
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+EXPORT_SYMBOL_GPL(dm_dispatch_request);
+
+static void kill_request(struct request *rq)
+{
+ int nr_bytes = rq->hard_nr_sectors << 9;
+
+ if (!nr_bytes)
+ nr_bytes = rq->data_len;
+
+ rq->cmd_flags |= REQ_QUIET;
+ end_that_request_first(rq, 0, nr_bytes);
+ end_that_request_last(rq, 0);
+}
+
/*-----------------------------------------------------------------
* CRUD START:
* A more elegant soln is in the works that uses the queue
@@ -460,7 +578,7 @@ static void dec_pending(struct dm_io *io, int error)
if (error && !(io->error > 0 && __noflush_suspending(io->md)))
io->error = error;
- if (atomic_dec_and_test(&io->io_count)) {
+ if (atomic_dec_and_test(&io->parm.io_count)) {
if (io->error == DM_ENDIO_REQUEUE) {
/*
* Target requested pushing back the I/O.
@@ -469,7 +587,7 @@ static void dec_pending(struct dm_io *io, int error)
*/
spin_lock_irqsave(&io->md->pushback_lock, flags);
if (__noflush_suspending(io->md))
- bio_list_add(&io->md->pushback, io->bio);
+ bio_list_add(&io->md->pushback, io->data.bio);
else
/* noflush suspend was interrupted. */
io->error = -EIO;
@@ -481,10 +599,10 @@ static void dec_pending(struct dm_io *io, int error)
wake_up(&io->md->wait);
if (io->error != DM_ENDIO_REQUEUE) {
- blk_add_trace_bio(io->md->queue, io->bio,
+ blk_add_trace_bio(io->md->queue, io->data.bio,
BLK_TA_COMPLETE);
- bio_endio(io->bio, io->bio->bi_size, io->error);
+ bio_endio(io->data.bio, io->data.bio->bi_size, io->error);
}
free_io(io->md, io);
@@ -533,6 +651,30 @@ static int clone_endio(struct bio *bio, unsigned int done, int error)
return r;
}
+static void blk_update_cloned_rq(struct request *rq, struct request *clone)
+{
+ clone->nr_phys_segments = rq->nr_phys_segments;
+ clone->nr_hw_segments = rq->nr_hw_segments;
+ clone->current_nr_sectors = rq->current_nr_sectors;
+ clone->hard_cur_sectors = rq->hard_cur_sectors;
+ clone->hard_nr_sectors = rq->hard_nr_sectors;
+ clone->nr_sectors = rq->nr_sectors;
+ clone->hard_sector = rq->hard_sector;
+ clone->sector = rq->sector;
+ clone->data_len = rq->data_len;
+ clone->buffer = rq->buffer;
+ clone->data = rq->data;
+ clone->bio = rq->bio;
+ clone->biotail = rq->biotail;
+}
+
+static void dec_rq_pending(struct dm_target_io *tio)
+{
+ if (!atomic_dec_return(&tio->io->md->pending))
+ /* nudge anyone waiting on suspend queue */
+ wake_up(&tio->io->md->wait);
+}
+
static sector_t max_io_len(struct mapped_device *md,
sector_t sector, struct dm_target *ti)
{
@@ -573,14 +715,14 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
* anything, the target has assumed ownership of
* this io.
*/
- atomic_inc(&tio->io->io_count);
+ atomic_inc(&tio->io->parm.io_count);
sector = clone->bi_sector;
r = ti->type->map(ti, clone, &tio->info);
if (r == DM_MAPIO_REMAPPED) {
/* the bio has been remapped so dispatch it */
blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone,
- tio->io->bio->bi_bdev->bd_dev,
+ tio->io->data.bio->bi_bdev->bd_dev,
clone->bi_sector, sector);
generic_make_request(clone);
@@ -613,7 +755,10 @@ struct clone_info {
static void dm_bio_destructor(struct bio *bio)
{
struct bio_set *bs = bio->bi_private;
-
+#ifdef DM_DEBUG
+ if (bio_rq_callback(bio))
+ printk(KERN_INFO "%s: bio %p\n", __FUNCTION__, bio);
+#endif
bio_free(bio, bs);
}
@@ -686,6 +831,7 @@ static void __clone_and_map(struct clone_info *ci)
clone = clone_bio(bio, ci->sector, ci->idx,
bio->bi_vcnt - ci->idx, ci->sector_count,
ci->md->bs);
+
__map_bio(ti, clone, tio);
ci->sector_count = 0;
@@ -740,7 +886,6 @@ static void __clone_and_map(struct clone_info *ci)
clone = split_bvec(bio, ci->sector, ci->idx,
bv->bv_offset + offset, len,
ci->md->bs);
-
__map_bio(ti, clone, tio);
ci->sector += len;
@@ -769,8 +914,8 @@ static void __split_bio(struct mapped_device *md, struct bio *bio)
ci.bio = bio;
ci.io = alloc_io(md);
ci.io->error = 0;
- atomic_set(&ci.io->io_count, 1);
- ci.io->bio = bio;
+ atomic_set(&ci.io->parm.io_count, 1);
+ ci.io->data.bio = bio;
ci.io->md = md;
ci.sector = bio->bi_sector;
ci.sector_count = bio_sectors(bio);
@@ -784,6 +929,272 @@ static void __split_bio(struct mapped_device *md, struct bio *bio)
dec_pending(ci.io, 0);
dm_table_put(ci.map);
}
+
+/*
+ * request cloning
+ */
+static void clone_end_request(struct request *rq, int error)
+{
+ struct dm_target_io *tio = (struct dm_target_io *)rq->end_io_data;
+ dm_request_endio_fn endio = tio->ti->type->rq_end_io;
+ struct request_queue *q = rq->q;
+ struct request *orig = tio->io->parm.orig;
+ struct mapped_device *md = tio->io->md;
+ int uptodate;
+
+ /* Safeguard */
+ rq->end_io = NULL;
+ rq->end_io_data = NULL;
+
+ if (endio)
+ endio(tio->ti, rq, error, &tio->info);
+
+ __blk_put_request(q, rq);
+
+ if (orig) {
+ spin_unlock_irq(q->queue_lock);
+ uptodate = (error < 0)? error: !error;
+ end_that_request_last(orig, uptodate);
+ spin_lock_irq(q->queue_lock);
+ }
+
+ dec_rq_pending(tio);
+ free_io(md, tio->io);
+ free_tio(md, tio);
+ free_rq(md, rq);
+}
+
+static int clone_rq_endio(struct bio *bio, unsigned int bytes, int error)
+{
+ struct dm_target_io *tio = (struct dm_target_io *)bio->bi_private;
+ struct request *clone = tio->io->data.req;
+ struct request *orig = tio->io->parm.orig;
+ struct mapped_device *md = tio->io->md;
+ int r = 0, uptodate;
+ dm_request_endio_first_fn endio = tio->ti->type->rq_end_io_first;
+ dm_request_queue_in_tgt_fn queue_in_tgt = tio->ti->type->queue_in_tgt;
+ struct request_queue *q = clone->q, *q_orig = orig->q;
+
+ if (bio->bi_size)
+ return 1;
+
+ if (endio) {
+ r = endio(tio->ti, clone, error, &tio->info);
+ switch (r) {
+ case 0:
+#ifdef DM_DEBUG
+ printk(KERN_INFO "clone %p bio %p endio done\n",
+ clone, bio);
+#endif
+ /* Destroy sequencer bio */
+ bio->bi_private = md->bs;
+ bio_put(bio);
+ /* succeeded */
+ break;
+ case DM_ENDIO_INCOMPLETE:
+ /* FIXME */
+ printk(KERN_INFO "clone %p bio %p endio incomplete\n",
+ clone, bio);
+ /* the target wants to handle the io without unmap */
+ __blk_put_request(q, clone);
+ clone->special = NULL;
+ clone->errors = 0;
+
+ if (!queue_in_tgt) {
+ DMERR("queue_in_tgt isn't implemented.");
+ BUG();
+ }
+ queue_in_tgt(tio->ti, clone, &tio->info);
+ blk_run_queue(q_orig);
+
+ return 0;
+ case DM_ENDIO_REQUEUE:
+ /* FIXME */
+ printk(KERN_INFO "clone %p bio %p endio requeue\n",
+ clone, bio);
+ /*
+ * The target wants to requeue the original request,
+ * unthread sequencer bio and stop bio processing.
+ */
+ clone->bio = NULL;
+ clone->biotail = clone->bio;
+ /* Destroy sequencer bio */
+ bio->bi_private = md->bs;
+ bio_put(bio);
+ /* Requeue original request */
+ dm_requeue_request(q_orig, orig);
+ break;
+ default:
+ /* FIXME */
+ printk(KERN_INFO "clone %p bio %p endio return %d\n",
+ clone, bio, r);
+ if (r >= 0) {
+ DMWARN("unimplemented target endio return value: %d", r);
+ BUG();
+ }
+ /* Destroy sequencer bio */
+ bio->bi_private = md->bs;
+ bio_put(bio);
+ break;
+ }
+ }
+
+ /*
+ * Finish original request
+ */
+ uptodate = (error < 0)? error: !error;
+ r = end_that_request_first(orig, uptodate,
+ orig->hard_cur_sectors);
+ /*
+ * Recopy the original request fields that were updated
+ * in blk_end_request() to the clone.
+ */
+ blk_update_cloned_rq(orig, clone);
+
+ return r?1:0;
+}
+
+static struct bio *alloc_seq_bio(struct request *req, struct bio_set *bs)
+{
+ struct bio *clone;
+
+ clone = bio_alloc_bioset(GFP_NOIO, 0, bs);
+ clone->bi_sector = 0;
+ clone->bi_flags |= 1 << BIO_CLONED;
+ clone->bi_rw |= (1 << BIO_RW_RQCALLBACK);
+ clone->bi_vcnt = 0;
+ clone->bi_size = 0;
+ clone->bi_idx = 0;
+ clone->bi_end_io = clone_rq_endio;
+ bio_phys_segments(req->q, clone);
+ bio_hw_segments(req->q, clone);
+ clone->bi_private = req;
+ clone->bi_destructor = dm_bio_destructor;
+ clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+#ifdef DM_DEBUG
+ printk(KERN_INFO "%s: bio %p\n", __FUNCTION__, clone);
+#endif
+ return clone;
+}
+
+static void setup_clone(struct request *clone, struct request *rq)
+{
+ memset(clone, 0, sizeof(struct request));
+ INIT_LIST_HEAD(&clone->queuelist);
+ INIT_LIST_HEAD(&clone->donelist);
+ clone->cmd_flags = (rq_data_dir(rq) | REQ_NOMERGE);
+ clone->cmd_type = rq->cmd_type;
+ clone->sector = rq->sector;
+ clone->hard_sector = rq->hard_sector;
+ clone->nr_sectors = rq->nr_sectors;
+ clone->hard_nr_sectors = rq->hard_nr_sectors;
+ clone->current_nr_sectors = rq->current_nr_sectors;
+ clone->hard_cur_sectors = rq->hard_cur_sectors;
+ clone->bio = rq->bio;
+ clone->biotail = rq->biotail;
+ INIT_HLIST_NODE(&clone->hash);
+ clone->start_time = jiffies;
+ clone->nr_phys_segments = rq->nr_phys_segments;
+ clone->nr_hw_segments = rq->nr_hw_segments;
+ clone->ioprio = rq->ioprio;
+ clone->buffer = rq->buffer;
+ clone->tag = -1;
+ clone->ref_count = 1;
+ clone->cmd_len = rq->cmd_len;
+ memcpy(clone->cmd, rq->cmd, sizeof(rq->cmd));
+ clone->data_len = rq->data_len;
+ clone->sense_len = rq->sense_len;
+ clone->data = rq->data;
+ clone->sense = rq->sense;
+}
+
+static int clone_and_map_request(struct dm_target *ti, struct request *rq,
+ struct mapped_device *md)
+{
+ int r;
+ struct request *clone;
+ struct dm_target_io *tio;
+
+ tio = alloc_tio(md); /* only one for each original request */
+ if (!tio)
+ /* -ENOMEM, requeue */
+ return 1;
+ tio->io = alloc_io(md);
+ if (!tio->io) {
+ /* -ENOMEM, requeue */
+ free_tio(md, tio);
+ return 1;
+ }
+ tio->io->md = md;
+ tio->io->error = 0;
+ tio->ti = ti;
+ memset(&tio->info, 0, sizeof(tio->info));
+
+ clone = alloc_rq(md);
+ if (!clone) {
+ /* -ENOMEM, requeue */
+ free_io(md, tio->io);
+ free_tio(md, tio);
+ return 1;
+ }
+ setup_clone(clone, rq);
+ clone->bio = alloc_seq_bio(clone, md->bs);
+ clone->end_io = clone_end_request;
+ clone->end_io_data = tio;
+ clone->bio->bi_end_io = clone_rq_endio;
+ clone->bio->bi_private = tio;
+ clone->bio->bi_next = rq->bio;
+ tio->io->data.req = clone;
+ tio->io->parm.orig = rq;
+
+ atomic_inc(&md->pending);
+ r = ti->type->map_rq(ti, clone, &tio->info);
+ switch (r) {
+ case DM_MAPIO_REMAPPED:
+#ifdef DM_DEBUG
+ printk(KERN_INFO "clone %p remapped\n", clone);
+#endif
+ /* the clone has been remapped so dispatch it */
+ dm_dispatch_request(clone->q, clone);
+ break;
+ case DM_MAPIO_SUBMITTED:
+#ifdef DM_DEBUG
+ printk(KERN_INFO "clone %p submitted\n", clone);
+#endif
+ /* the target has taken the request to submit by itself */
+ break;
+ case DM_MAPIO_REQUEUE:
+ printk(KERN_INFO "clone %p removed\n", clone);
+ /* the target has requested to requeue the original request */
+ clone->end_io = NULL;
+ clone->end_io_data = NULL;
+ dec_rq_pending(tio);
+ free_io(md, tio->io);
+ free_tio(md, tio);
+ free_rq(md, clone);
+ printk(KERN_INFO "requeue request %p\n", rq);
+ return 1;
+ default:
+ printk(KERN_INFO "clone %p failure %d\n", clone, r);
+ if (r >= 0) {
+ DMWARN("unimplemented target map return value: %d", r);
+ BUG();
+ }
+ clone->end_io = NULL;
+ clone->end_io_data = NULL;
+
+ dec_rq_pending(tio);
+ free_io(md, tio->io);
+ free_tio(md, tio);
+ free_rq(md, clone);
+ printk(KERN_INFO "kill request %p\n", rq);
+ kill_request(rq);
+ break;
+ }
+
+ return 0;
+}
+
/*-----------------------------------------------------------------
* CRUD END
*---------------------------------------------------------------*/
@@ -844,6 +1255,41 @@ static int dm_request(struct request_queue *q, struct bio *bio)
return 0;
}
+/*
+ * q->request_fn for request based dm.
+ * called with q->queue_lock held
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+ int r;
+ struct mapped_device *md = (struct mapped_device *)q->queuedata;
+ struct dm_table *map = dm_get_table(md);
+ struct dm_target *ti;
+ struct request *rq;
+
+ while (!blk_queue_plugged(q)) {
+ rq = elv_next_request(q);
+ if (!rq)
+ break;
+#ifdef DM_DEBUG
+ printk(KERN_INFO "got request %p\n", rq);
+#endif
+ ti = dm_table_find_target(map, rq->sector);
+
+ blkdev_dequeue_request(rq);
+ spin_unlock_irq(q->queue_lock);
+ r = clone_and_map_request(ti, rq, md);
+ spin_lock_irq(q->queue_lock);
+
+ if (r)
+ __dm_requeue_request(q, rq);
+ }
+
+ dm_table_put(map);
+
+ return;
+}
+
static int dm_flush_all(struct request_queue *q, struct gendisk *disk,
sector_t *error_sector)
{
@@ -870,6 +1316,30 @@ static void dm_unplug_all(struct request_queue *q)
}
}
+static int dm_make_request(struct request_queue *q, struct bio *bio)
+{
+ /*
+ * There is no use in forwarding any barrier request since we can't
+ * guarantee it is (or can be) handled by the targets correctly.
+ */
+ if (unlikely(bio_barrier(bio))) {
+ bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ return 0;
+ }
+
+ return __make_request(q, bio);
+}
+
+static int dm_prep_rq (struct request_queue *q, struct request *req)
+{
+ struct mapped_device *md = q->queuedata;
+
+ if (test_bit(DMF_BLOCK_IO, &md->flags))
+ return BLKPREP_DEFER;
+
+ return BLKPREP_OK;
+}
+
static int dm_any_congested(void *congested_data, int bdi_bits)
{
int r;
@@ -997,10 +1467,14 @@ static struct mapped_device *alloc_dev(int minor)
atomic_set(&md->open_count, 0);
atomic_set(&md->event_nr, 0);
- md->queue = blk_alloc_queue(GFP_KERNEL);
+ md->queue = blk_init_queue(dm_request_fn, NULL);
if (!md->queue)
goto bad1_free_minor;
+ md->queue->nr_requests = BLKDEV_MAX_RQ;
+ md->queue->nr_batching = 32;
+ md->queue->unplug_thresh = 4;
+ md->queue->unplug_delay = (3 * HZ) / 1000;
md->queue->queuedata = md;
md->queue->backing_dev_info.congested_fn = dm_any_congested;
md->queue->backing_dev_info.congested_data = md;
@@ -1013,9 +1487,13 @@ static struct mapped_device *alloc_dev(int minor)
if (!md->io_pool)
goto bad2;
+ md->rq_pool = mempool_create_slab_pool(MIN_IOS, _rq_cache);
+ if (!md->rq_pool)
+ goto bad3;
+
md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
if (!md->tio_pool)
- goto bad3;
+ goto bad4;
md->bs = bioset_create(16, 16);
if (!md->bs)
@@ -1023,7 +1501,7 @@ static struct mapped_device *alloc_dev(int minor)
md->disk = alloc_disk(1);
if (!md->disk)
- goto bad4;
+ goto bad5;
atomic_set(&md->pending, 0);
init_waitqueue_head(&md->wait);
@@ -1047,10 +1525,12 @@ static struct mapped_device *alloc_dev(int minor)
return md;
- bad4:
+ bad5:
bioset_free(md->bs);
bad_no_bioset:
mempool_destroy(md->tio_pool);
+ bad4:
+ mempool_destroy(md->rq_pool);
bad3:
mempool_destroy(md->io_pool);
bad2:
@@ -1073,6 +1553,7 @@ static void free_dev(struct mapped_device *md)
bdput(md->suspended_bdev);
}
mempool_destroy(md->tio_pool);
+ mempool_destroy(md->rq_pool);
mempool_destroy(md->io_pool);
bioset_free(md->bs);
del_gendisk(md->disk);
@@ -1508,6 +1989,40 @@ out:
return r;
}
+/*
+ * Functions to enable and disable elevator support for
+ * individual devices.
+ */
+int dm_enable_elevator(struct mapped_device *md)
+{
+ if (md->queue->make_request_fn == dm_make_request)
+ return 0;
+
+ blk_queue_make_request(md->queue, dm_make_request);
+ blk_queue_prep_rq(md->queue, dm_prep_rq);
+ md->queue->unplug_fn = generic_unplug_device;
+ md->queue->backing_dev_info.congested_fn = NULL;
+ md->queue->backing_dev_info.congested_data = NULL;
+ set_bit(QUEUE_FLAG_CLUSTER, &md->queue->queue_flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(dm_enable_elevator);
+
+void dm_disable_elevator(struct mapped_device *md)
+{
+ if (md->queue->make_request_fn == dm_request)
+ return;
+
+ blk_queue_make_request(md->queue, dm_request);
+ blk_queue_prep_rq(md->queue, NULL);
+ md->queue->unplug_fn = dm_unplug_all;
+ md->queue->backing_dev_info.congested_fn = dm_any_congested;
+ md->queue->backing_dev_info.congested_data = md;
+ clear_bit(QUEUE_FLAG_CLUSTER, &md->queue->queue_flags);
+}
+EXPORT_SYMBOL(dm_disable_elevator);
+
/*-----------------------------------------------------------------
* Event notification.
*---------------------------------------------------------------*/
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 462ee65..57e7edb 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -112,6 +112,8 @@ int dm_table_resume_targets(struct dm_table *t);
int dm_table_any_congested(struct dm_table *t, int bdi_bits);
void dm_table_unplug_all(struct dm_table *t);
int dm_table_flush_all(struct dm_table *t);
+int dm_table_enable_elevator(struct dm_table *t);
+void dm_table_disable_elevator(struct dm_table *t);
/*-----------------------------------------------------------------
* A registry of target types.
@@ -182,5 +184,10 @@ void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size);
union map_info *dm_get_mapinfo(struct bio *bio);
int dm_open_count(struct mapped_device *md);
int dm_lock_for_deletion(struct mapped_device *md);
+void dm_dispatch_request(struct request_queue *q, struct request *rq);
+int dm_enable_elevator(struct mapped_device *md);
+void dm_disable_elevator(struct mapped_device *md);
+struct request_queue *dm_get_queue(struct mapped_device *md);
+void dm_put_queue(struct request_queue *q);
#endif
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 1ddef34..cb46e30 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -152,6 +152,7 @@ struct bio {
#define BIO_RW_FAILFAST 3
#define BIO_RW_SYNC 4
#define BIO_RW_META 5
+#define BIO_RW_RQCALLBACK 6 /* Request callback */
/*
* upper 16 bits of bi_rw define the io priority of this bio
@@ -183,6 +184,7 @@ struct bio {
#define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
#define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
+#define bio_rq_callback(bio) ((bio)->bi_rw & (1 << BIO_RW_RQCALLBACK))
/*
* will die
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b126c6f..7f72ea6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -647,6 +647,7 @@ extern void register_disk(struct gendisk *dev);
extern void generic_make_request(struct bio *bio);
extern void blk_put_request(struct request *);
extern void __blk_put_request(struct request_queue *, struct request *);
+extern int __make_request(struct request_queue *q, struct bio *bio);
extern void blk_end_sync_rq(struct request *rq, int error);
extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 499f537..1e7748a 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -14,6 +14,7 @@ struct dm_target;
struct dm_table;
struct dm_dev;
struct mapped_device;
+struct request;
typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
@@ -45,6 +46,9 @@ typedef void (*dm_dtr_fn) (struct dm_target *ti);
typedef int (*dm_map_fn) (struct dm_target *ti, struct bio *bio,
union map_info *map_context);
+typedef int (*dm_map_request_fn) (struct dm_target *ti, struct request *clone,
+ union map_info *map_context);
+
/*
* Returns:
* < 0 : error (currently ignored)
@@ -57,6 +61,18 @@ typedef int (*dm_endio_fn) (struct dm_target *ti,
struct bio *bio, int error,
union map_info *map_context);
+typedef int (*dm_request_endio_first_fn) (struct dm_target *ti,
+ struct request *clone, int error,
+ union map_info *map_context);
+
+typedef int (*dm_request_endio_fn) (struct dm_target *ti,
+ struct request *clone, int error,
+ union map_info *map_context);
+
+typedef void (*dm_request_queue_in_tgt_fn) (struct dm_target *ti,
+ struct request *clone,
+ union map_info *map_context);
+
typedef void (*dm_flush_fn) (struct dm_target *ti);
typedef void (*dm_presuspend_fn) (struct dm_target *ti);
typedef void (*dm_postsuspend_fn) (struct dm_target *ti);
@@ -98,7 +114,11 @@ struct target_type {
dm_ctr_fn ctr;
dm_dtr_fn dtr;
dm_map_fn map;
+ dm_map_request_fn map_rq;
dm_endio_fn end_io;
+ dm_request_endio_first_fn rq_end_io_first;
+ dm_request_endio_fn rq_end_io;
+ dm_request_queue_in_tgt_fn queue_in_tgt;
dm_flush_fn flush;
dm_presuspend_fn presuspend;
dm_postsuspend_fn postsuspend;