On Mon, Jul 31, 2017 at 03:24:40PM -0700, Dave Jiang wrote: > Adding blk-mq support to the pmem driver in addition to the direct bio > support. This allows for hardware offloading via DMA engines. By default > the bio method will be enabled. The blk-mq support can be turned on via > module parameter queue_mode=1. > > Signed-off-by: Dave Jiang <dave.jiang@xxxxxxxxx> > --- > drivers/nvdimm/pmem.c | 131 +++++++++++++++++++++++++++++++++++++++++++++---- > drivers/nvdimm/pmem.h | 3 + > 2 files changed, 122 insertions(+), 12 deletions(-) > > diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c > index c544d46..347835b 100644 > --- a/drivers/nvdimm/pmem.c > +++ b/drivers/nvdimm/pmem.c > @@ -31,10 +31,24 @@ > #include <linux/pmem.h> > #include <linux/dax.h> > #include <linux/nd.h> > +#include <linux/blk-mq.h> > #include "pmem.h" > #include "pfn.h" > #include "nd.h" > > +enum { > + PMEM_Q_BIO = 0, > + PMEM_Q_MQ = 1, > +}; > + > +static int queue_mode = PMEM_Q_BIO; > +module_param(queue_mode, int, 0444); > +MODULE_PARM_DESC(queue_mode, "Pmem Queue Mode"); Maybe in the parameter description we can give the user a hint as to what the settings are? i.e. "PMEM queue mode (0=BIO, 1=MQ+DMA)" or something? > + > +struct pmem_cmd { > + struct request *rq; > +}; > + > static struct device *to_dev(struct pmem_device *pmem) > { > /* > @@ -239,9 +253,12 @@ static const struct dax_operations pmem_dax_ops = { > .direct_access = pmem_dax_direct_access, > }; > > -static void pmem_release_queue(void *q) > +static void pmem_release_queue(void *pmem) > { > - blk_cleanup_queue(q); > + Unneeded whitespace. Also, a nit, can we maybe add a local pmem with type "struct pmem_device *" so we don't have to cast twice? 
> + blk_cleanup_queue(((struct pmem_device *)pmem)->q); > + if (queue_mode == PMEM_Q_MQ) > + blk_mq_free_tag_set(&((struct pmem_device *)pmem)->tag_set); > } > > static void pmem_freeze_queue(void *q) > @@ -259,6 +276,65 @@ static void pmem_release_disk(void *__pmem) > put_disk(pmem->disk); > } > > +static int pmem_handle_cmd(struct pmem_cmd *cmd) > +{ > + struct request *req = cmd->rq; > + struct request_queue *q = req->q; > + struct pmem_device *pmem = q->queuedata; > + struct nd_region *nd_region = to_region(pmem); > + struct bio_vec bvec; > + struct req_iterator iter; > + int rc = 0; > + > + if (req->cmd_flags & REQ_FLUSH) > + nvdimm_flush(nd_region); > + > + rq_for_each_segment(bvec, req, iter) { > + rc = pmem_do_bvec(pmem, bvec.bv_page, bvec.bv_len, > + bvec.bv_offset, op_is_write(req_op(req)), > + iter.iter.bi_sector); > + if (rc < 0) > + break; > + } > + > + if (req->cmd_flags & REQ_FUA) > + nvdimm_flush(nd_region); > + > + blk_mq_end_request(cmd->rq, rc); > + > + return 0; return rc; ? > +} > + > +static int pmem_queue_rq(struct blk_mq_hw_ctx *hctx, > + const struct blk_mq_queue_data *bd) > +{ > + struct pmem_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); > + int rc; > + > + cmd->rq = bd->rq; > + > + blk_mq_start_request(bd->rq); > + > + rc = pmem_handle_cmd(cmd); > + if (rc < 0) > + rc = BLK_MQ_RQ_QUEUE_ERROR; > + else > + rc = BLK_MQ_RQ_QUEUE_OK; > + > + return rc; > +} You can get rid of 'rc' altogether and simplify this to: static int pmem_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct pmem_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); cmd->rq = bd->rq; blk_mq_start_request(bd->rq); if (pmem_handle_cmd(cmd) < 0) return BLK_MQ_RQ_QUEUE_ERROR; else return BLK_MQ_RQ_QUEUE_OK; } > +static int pmem_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, > + unsigned int index) > +{ > + return 0; > +} I'm pretty sure you don't need a dummy implementation of .init_hctx. 
All callers check to make sure it's defined before calling, so if you just leave the OP NULL it'll do the same thing. scsi_mq_ops for example does this. > @@ -303,17 +380,47 @@ static int pmem_attach_disk(struct device *dev, > return -EBUSY; > } > > - q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); > - if (!q) > - return -ENOMEM; > + if (queue_mode == PMEM_Q_MQ) { > + pmem->tag_set.ops = &pmem_mq_ops; > + pmem->tag_set.nr_hw_queues = nr_online_nodes; > + pmem->tag_set.queue_depth = 64; > + pmem->tag_set.numa_node = dev_to_node(dev); > + pmem->tag_set.cmd_size = sizeof(struct pmem_cmd); > + pmem->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; > + pmem->tag_set.driver_data = pmem; > + > + rc = blk_mq_alloc_tag_set(&pmem->tag_set); > + if (rc < 0) > + return rc; > + > + pmem->q = blk_mq_init_queue(&pmem->tag_set); > + if (IS_ERR(pmem->q)) { > + blk_mq_free_tag_set(&pmem->tag_set); > + return -ENOMEM; > + } > > - if (devm_add_action_or_reset(dev, pmem_release_queue, q)) > - return -ENOMEM; > + if (devm_add_action_or_reset(dev, pmem_release_queue, pmem)) { > + blk_mq_free_tag_set(&pmem->tag_set); > + return -ENOMEM; > + } > + } else if (queue_mode == PMEM_Q_BIO) { > + pmem->q = blk_alloc_queue_node(GFP_KERNEL, dev_to_node(dev)); > + if (!pmem->q) > + return -ENOMEM; > + > + if (devm_add_action_or_reset(dev, pmem_release_queue, pmem)) > + return -ENOMEM; > + > + blk_queue_make_request(pmem->q, pmem_make_request); Later on in this function there is a big block that is still manipulating 'q' instead of pmem->q: blk_queue_write_cache(q, true, true); blk_queue_make_request(q, pmem_make_request); blk_queue_physical_block_size(q, PAGE_SIZE); blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); queue_flag_set_unlocked(QUEUE_FLAG_DAX, q); q->queuedata = pmem; I'm pretty sure this all needs to be looking at pmem->q instead, and we may just end up killing the local 'q'. 
Also, this block duplicates the blk_queue_make_request() call, which is already made explicitly in the PMEM_Q_BIO case above. I'm not sure whether this call needs to be common to both that case and the PMEM_Q_MQ case, but we probably don't want it in both places. -- To unsubscribe from this list: send the line "unsubscribe dmaengine" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html