Please add linux-nvme@xxxxxxxxxxxxxxxxxxx to the Cc list, along with the
NVMe subsystem maintainers. Also, do you have performance numbers for WRR?

On 06/16/2019 03:16 AM, Weiping Zhang wrote:
> Now nvme supports six types of hardware queues:
> poll: if io was marked for poll
> wrr_low: weighted round robin low
> wrr_medium: weighted round robin medium
> wrr_high: weighted round robin high
> read: for reads, if the blkcg's wrr is none and the io is not marked for poll
> default: for write/flush, if the blkcg's wrr is none and the io is not marked for poll
>
> The read, default and poll submission queues have medium priority by default.
>
> Signed-off-by: Weiping Zhang <zhangweiping@xxxxxxxxxxxxxx>
> ---
>  drivers/nvme/host/pci.c   | 195 +++++++++++++++++++++++++++++++++-------------
>  include/linux/interrupt.h |   2 +-
>  2 files changed, 144 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index f562154551ce..ee9f3239f3e7 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -73,16 +73,28 @@ static const struct kernel_param_ops queue_count_ops = {
>  	.get = param_get_int,
>  };
>
> -static int write_queues;
> -module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
> -MODULE_PARM_DESC(write_queues,
> -	"Number of queues to use for writes. If not set, reads and writes "
> +static int read_queues;
> +module_param_cb(read_queues, &queue_count_ops, &read_queues, 0644);
> +MODULE_PARM_DESC(read_queues,
> +	"Number of queues to use for reads. If not set, reads and writes "
>  	"will share a queue set.");
>
>  static int poll_queues = 0;
>  module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
>  MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
>
> +static int wrr_high_queues = 0;
> +module_param_cb(wrr_high_queues, &queue_count_ops, &wrr_high_queues, 0644);
> +MODULE_PARM_DESC(wrr_high_queues, "Number of queues to use for WRR high.");
> +
> +static int wrr_medium_queues = 0;
> +module_param_cb(wrr_medium_queues, &queue_count_ops, &wrr_medium_queues, 0644);
> +MODULE_PARM_DESC(wrr_medium_queues, "Number of queues to use for WRR medium.");
> +
> +static int wrr_low_queues = 0;
> +module_param_cb(wrr_low_queues, &queue_count_ops, &wrr_low_queues, 0644);
> +MODULE_PARM_DESC(wrr_low_queues, "Number of queues to use for WRR low.");
> +
>  struct nvme_dev;
>  struct nvme_queue;
>
> @@ -226,9 +238,17 @@ struct nvme_iod {
>  	struct scatterlist *sg;
>  };
>
> +static inline bool nvme_is_enable_wrr(struct nvme_dev *dev)
> +{
> +	return dev->io_queues[HCTX_TYPE_WRR_LOW] +
> +		dev->io_queues[HCTX_TYPE_WRR_MEDIUM] +
> +		dev->io_queues[HCTX_TYPE_WRR_HIGH] > 0;
> +}
> +
>  static unsigned int max_io_queues(void)
>  {
> -	return num_possible_cpus() + write_queues + poll_queues;
> +	return num_possible_cpus() + read_queues + poll_queues +
> +		wrr_high_queues + wrr_medium_queues + wrr_low_queues;
>  }
>
>  static unsigned int max_queue_count(void)
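
The helper name "nvme_is_enable_wrr" reads awkwardly, and summing the three
counters just to compare against zero is roundabout. Untested, but something
like the following might be clearer (pure naming/style suggestion, same
behavior):

static inline bool nvme_wrr_enabled(struct nvme_dev *dev)
{
	return dev->io_queues[HCTX_TYPE_WRR_LOW] ||
	       dev->io_queues[HCTX_TYPE_WRR_MEDIUM] ||
	       dev->io_queues[HCTX_TYPE_WRR_HIGH];
}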
> @@ -1156,19 +1176,23 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
>  }
>
>  static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
> -				struct nvme_queue *nvmeq)
> +				struct nvme_queue *nvmeq, int wrr)
>  {
>  	struct nvme_ctrl *ctrl = &dev->ctrl;
>  	struct nvme_command c;
>  	int flags = NVME_QUEUE_PHYS_CONTIG;
>
> -	/*
> -	 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
> -	 * set. Since URGENT priority is zeroes, it makes all queues
> -	 * URGENT.
> -	 */
> -	if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
> -		flags |= NVME_SQ_PRIO_MEDIUM;
> +	if (!nvme_is_enable_wrr(dev)) {
> +		/*
> +		 * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
> +		 * set. Since URGENT priority is zeroes, it makes all queues
> +		 * URGENT.
> +		 */
> +		if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
> +			flags |= NVME_SQ_PRIO_MEDIUM;
> +	} else {
> +		flags |= wrr;
> +	}
>
>  	/*
>  	 * Note: we (ab)use the fact that the prp fields survive if no data
> @@ -1534,11 +1558,46 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>  	wmb(); /* ensure the first interrupt sees the initialization */
>  }
>
> -static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
> +static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
>  {
>  	struct nvme_dev *dev = nvmeq->dev;
> -	int result;
> +	int start, end, result, wrr;
> +	bool polled = false;
>  	u16 vector = 0;
> +	enum hctx_type type;
> +
> +	/* qid 0 is the admin queue, io queue index >= 1 */
> +	start = 1;
> +	/* get the hardware context type based on qid */
> +	for (type = HCTX_TYPE_DEFAULT; type < HCTX_MAX_TYPES; type++) {
> +		end = start + dev->io_queues[type] - 1;
> +		if (qid >= start && qid <= end)
> +			break;
> +		start = end + 1;
> +	}
> +
> +	if (nvme_is_enable_wrr(dev)) {
> +		/* set read, poll and default to medium by default */
> +		switch (type) {
> +		case HCTX_TYPE_POLL:
> +			polled = true;
> +		case HCTX_TYPE_DEFAULT:
> +		case HCTX_TYPE_READ:
> +		case HCTX_TYPE_WRR_MEDIUM:
> +			wrr = NVME_SQ_PRIO_MEDIUM;
> +			break;
> +		case HCTX_TYPE_WRR_LOW:
> +			wrr = NVME_SQ_PRIO_LOW;
> +			break;
> +		case HCTX_TYPE_WRR_HIGH:
> +			wrr = NVME_SQ_PRIO_HIGH;
> +			break;
> +		default:
> +			return -EINVAL;
> +		}
> +	} else {
> +		wrr = 0;
> +	}
>
>  	clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
>
> @@ -1555,7 +1614,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
>  	if (result)
>  		return result;
>
> -	result = adapter_alloc_sq(dev, qid, nvmeq);
> +	result = adapter_alloc_sq(dev, qid, nvmeq, wrr);
>  	if (result < 0)
>  		return result;
>  	else if (result)
> @@ -1726,7 +1785,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
>
>  static int nvme_create_io_queues(struct nvme_dev *dev)
>  {
> -	unsigned i, max, rw_queues;
> +	unsigned i, max;
>  	int ret = 0;
>
>  	for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
> @@ -1737,17 +1796,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
>  	}
>
>  	max = min(dev->max_qid, dev->ctrl.queue_count - 1);
> -	if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
> -		rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
> -				dev->io_queues[HCTX_TYPE_READ];
> -	} else {
> -		rw_queues = max;
> -	}
>
>  	for (i = dev->online_queues; i <= max; i++) {
> -		bool polled = i > rw_queues;
> -
> -		ret = nvme_create_queue(&dev->queues[i], i, polled);
> +		ret = nvme_create_queue(&dev->queues[i], i);
>  		if (ret)
>  			break;
>  	}
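
Two comments on nvme_create_queue():

The fall-through from HCTX_TYPE_POLL into the MEDIUM cases looks intentional,
but please annotate it with a /* fall through */ comment so it survives
-Wimplicit-fallthrough.

For reference, the wrr values assigned above are the QPRIO encodings for
Dword 11 of the Create I/O SQ command, already defined in
include/linux/nvme.h:

	NVME_SQ_PRIO_URGENT	= (0 << 1),
	NVME_SQ_PRIO_HIGH	= (1 << 1),
	NVME_SQ_PRIO_MEDIUM	= (2 << 1),
	NVME_SQ_PRIO_LOW	= (3 << 1),

so URGENT is deliberately never handed out (everything non-WRR maps to
MEDIUM), which also sidesteps the NVME_QUIRK_MEDIUM_PRIO_SQ problem. As a
worked example of the qid -> type loop, assuming the new WRR entries sit
after HCTX_TYPE_POLL in enum hctx_type: with
default/read/poll/wrr_low/wrr_medium/wrr_high = 4/2/1/1/1/1, qids 1-4 map to
DEFAULT, 5-6 to READ, 7 to POLL, and 8/9/10 to WRR low/medium/high.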
> @@ -2028,35 +2079,73 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
>  static void nvme_calc_irq_sets(struct irq_affinity *affd, unsigned int nrirqs)
>  {
>  	struct nvme_dev *dev = affd->priv;
> -	unsigned int nr_read_queues;
> +	unsigned int nr_total, nr, nr_read, nr_default;
> +	unsigned int nr_wrr_high, nr_wrr_medium, nr_wrr_low;
> +	unsigned int nr_sets;
>
>  	/*
>  	 * If there is no interrupt available for queues, ensure that
>  	 * the default queue is set to 1. The affinity set size is
>  	 * also set to one, but the irq core ignores it for this case.
> -	 *
> -	 * If only one interrupt is available or 'write_queue' == 0, combine
> -	 * write and read queues.
> -	 *
> -	 * If 'write_queues' > 0, ensure it leaves room for at least one read
> -	 * queue.
>  	 */
> -	if (!nrirqs) {
> +	if (!nrirqs)
>  		nrirqs = 1;
> -		nr_read_queues = 0;
> -	} else if (nrirqs == 1 || !write_queues) {
> -		nr_read_queues = 0;
> -	} else if (write_queues >= nrirqs) {
> -		nr_read_queues = 1;
> -	} else {
> -		nr_read_queues = nrirqs - write_queues;
> -	}
>
> -	dev->io_queues[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -	affd->set_size[HCTX_TYPE_DEFAULT] = nrirqs - nr_read_queues;
> -	dev->io_queues[HCTX_TYPE_READ] = nr_read_queues;
> -	affd->set_size[HCTX_TYPE_READ] = nr_read_queues;
> -	affd->nr_sets = nr_read_queues ? 2 : 1;
> +	nr_total = nrirqs;
> +
> +	nr_read = nr_wrr_high = nr_wrr_medium = nr_wrr_low = 0;
> +
> +	/* set default to 1, all remaining queues are added to default at the end */
> +	nr = nr_default = 1;
> +	nr_sets = 1;
> +
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* read queues */
> +	nr_sets++;
> +	nr_read = nr = read_queues > nr_total ? nr_total : read_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr low queues */
> +	nr_sets++;
> +	nr_wrr_low = nr = wrr_low_queues > nr_total ? nr_total : wrr_low_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr medium queues */
> +	nr_sets++;
> +	nr_wrr_medium = nr = wrr_medium_queues > nr_total ? nr_total : wrr_medium_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* wrr high queues */
> +	nr_sets++;
> +	nr_wrr_high = nr = wrr_high_queues > nr_total ? nr_total : wrr_high_queues;
> +	nr_total -= nr;
> +	if (!nr_total)
> +		goto done;
> +
> +	/* add all remaining queues to default */
> +	nr_default += nr_total;
> +
> +done:
> +	dev->io_queues[HCTX_TYPE_DEFAULT] = nr_default;
> +	affd->set_size[HCTX_TYPE_DEFAULT] = nr_default;
> +	dev->io_queues[HCTX_TYPE_READ] = nr_read;
> +	affd->set_size[HCTX_TYPE_READ] = nr_read;
> +	dev->io_queues[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
> +	affd->set_size[HCTX_TYPE_WRR_LOW] = nr_wrr_low;
> +	dev->io_queues[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
> +	affd->set_size[HCTX_TYPE_WRR_MEDIUM] = nr_wrr_medium;
> +	dev->io_queues[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
> +	affd->set_size[HCTX_TYPE_WRR_HIGH] = nr_wrr_high;
> +	affd->nr_sets = nr_sets;
>  }
>
>  static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
> @@ -2171,10 +2260,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
>  		nvme_suspend_io_queues(dev);
>  		goto retry;
>  	}
> -	dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
> +	dev_info(dev->ctrl.device, "%d/%d/%d/%d/%d/%d "
> +		"default/read/poll/wrr_low/wrr_medium/wrr_high queues\n",
>  		 dev->io_queues[HCTX_TYPE_DEFAULT],
>  		 dev->io_queues[HCTX_TYPE_READ],
> -		 dev->io_queues[HCTX_TYPE_POLL]);
> +		 dev->io_queues[HCTX_TYPE_POLL],
> +		 dev->io_queues[HCTX_TYPE_WRR_LOW],
> +		 dev->io_queues[HCTX_TYPE_WRR_MEDIUM],
> +		 dev->io_queues[HCTX_TYPE_WRR_HIGH]);
>  	return 0;
>  }
>
> @@ -2263,9 +2356,7 @@ static int nvme_dev_add(struct nvme_dev *dev)
>  	if (!dev->ctrl.tagset) {
>  		dev->tagset.ops = &nvme_mq_ops;
>  		dev->tagset.nr_hw_queues = dev->online_queues - 1;
> -		dev->tagset.nr_maps = 2; /* default + read */
> -		if (dev->io_queues[HCTX_TYPE_POLL])
> -			dev->tagset.nr_maps++;
> +		dev->tagset.nr_maps = HCTX_MAX_TYPES;
>  		dev->tagset.timeout = NVME_IO_TIMEOUT;
>  		dev->tagset.numa_node = dev_to_node(dev->dev);
>  		dev->tagset.queue_depth =
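
The five copies of "foo_queues > nr_total ? nr_total : foo_queues" in
nvme_calc_irq_sets() would be easier to read with min_t() (untested; min_t()
rather than min() because the module parameters are int while nr_total is
unsigned int):

	nr_read = nr = min_t(unsigned int, read_queues, nr_total);

Also, nr_sets is incremented even when the corresponding count ends up being
0, so the irq core can be handed zero-sized sets in the middle of the list.
Is that actually handled, or should empty sets be skipped here?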
> diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
> index c7eef32e7739..ea726c2f95cc 100644
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -259,7 +259,7 @@ struct irq_affinity_notify {
>  	void (*release)(struct kref *ref);
>  };
>
> -#define IRQ_AFFINITY_MAX_SETS  4
> +#define IRQ_AFFINITY_MAX_SETS  7
>
>  /**
>   * struct irq_affinity - Description for automatic irq affinity assignements
>
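
The IRQ_AFFINITY_MAX_SETS bump touches the generic irq layer, so it should
probably go in a separate patch with the irq maintainers Cc'ed.

Back to the performance question: numbers from concurrent runs against each
priority class, showing IOPS splitting roughly according to the HPW/MPW/LPW
weights programmed via the Arbitration feature (Feature ID 0x01), would make
the case for this series. Something along these lines, run once per priority
class (device path and the blkcg wiring are placeholders for whatever the
final interface looks like):

fio --name=wrr --filename=/dev/nvme0n1 --direct=1 --rw=randread \
    --bs=4k --iodepth=32 --numjobs=4 --time_based --runtime=60 \
    --group_reporting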