* Alexander Gordeev <agordeev@xxxxxxxxxx> wrote: > The MSI specification has several constraints in comparison with MSI-X, > most notable of them is the inability to configure MSIs independently. > As a result, it is impossible to dispatch interrupts from different > queues to different CPUs. This largely devalues the support of > multiple MSIs in SMP systems. > > Also, a necessity to allocate a contiguous block of vector numbers for > devices capable of multiple MSIs might cause a considerable pressure on > x86 interrupt vector allocator and could lead to fragmentation of the > interrupt vectors space. > > This patch overcomes both drawbacks in the presence of IRQ remapping and > lets devices take advantage of multiple queues and per-IRQ affinity > assignments. > > Signed-off-by: Alexander Gordeev <agordeev@xxxxxxxxxx> > --- > arch/x86/kernel/apic/io_apic.c | 174 +++++++++++++++++++++++++++++++++------ > include/linux/irq.h | 6 ++ > kernel/irq/chip.c | 30 +++++-- > kernel/irq/irqdesc.c | 31 +++++++ > 4 files changed, 206 insertions(+), 35 deletions(-) > > diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c > index c265593..d5cb13c 100644 > --- a/arch/x86/kernel/apic/io_apic.c > +++ b/arch/x86/kernel/apic/io_apic.c > @@ -305,6 +305,11 @@ static int alloc_irq_from(unsigned int from, int node) > return irq_alloc_desc_from(from, node); > } > > +static int alloc_irqs_from(unsigned int from, unsigned int count, int node) > +{ > + return irq_alloc_descs_from(from, count, node); > +} > + > static void free_irq_at(unsigned int at, struct irq_cfg *cfg) > { > free_irq_cfg(at, cfg); > @@ -2991,37 +2996,58 @@ device_initcall(ioapic_init_ops); > /* > * Dynamic irq allocate and deallocation > */ > -unsigned int create_irq_nr(unsigned int from, int node) > +unsigned int __create_irqs(unsigned int from, unsigned int count, int node) > { > - struct irq_cfg *cfg; > + struct irq_cfg **cfg; > unsigned long flags; > - unsigned int ret = 0; > - int irq; > + int 
irq, i; > > if (from < nr_irqs_gsi) > from = nr_irqs_gsi; > > - irq = alloc_irq_from(from, node); > - if (irq < 0) > - return 0; > - cfg = alloc_irq_cfg(irq, node); > - if (!cfg) { > - free_irq_at(irq, NULL); > + cfg = kzalloc_node(count * sizeof(cfg[0]), GFP_KERNEL, node); > + if (!cfg) > return 0; > + > + irq = alloc_irqs_from(from, count, node); > + if (irq < 0) > + goto out_cfgs; > + > + for (i = 0; i < count; i++) { > + cfg[i] = alloc_irq_cfg(irq + i, node); > + if (!cfg[i]) > + goto out_irqs; > } > > raw_spin_lock_irqsave(&vector_lock, flags); > - if (!__assign_irq_vector(irq, cfg, apic->target_cpus())) > - ret = irq; > + for (i = 0; i < count; i++) > + if (__assign_irq_vector(irq + i, cfg[i], apic->target_cpus())) > + goto out_vecs; > raw_spin_unlock_irqrestore(&vector_lock, flags); > > - if (ret) { > - irq_set_chip_data(irq, cfg); > - irq_clear_status_flags(irq, IRQ_NOREQUEST); > - } else { > - free_irq_at(irq, cfg); > + for (i = 0; i < count; i++) { > + irq_set_chip_data(irq + i, cfg[i]); > + irq_clear_status_flags(irq + i, IRQ_NOREQUEST); > } > - return ret; > + > + kfree(cfg); > + return irq; > + > +out_vecs: > + for (; i; i--) > + __clear_irq_vector(irq + i - 1, cfg[i - 1]); > + raw_spin_unlock_irqrestore(&vector_lock, flags); > +out_irqs: > + for (i = 0; i < count; i++) > + free_irq_at(irq + i, cfg[i]); > +out_cfgs: > + kfree(cfg); > + return 0; > +} > + > +unsigned int create_irq_nr(unsigned int from, int node) > +{ > + return __create_irqs(from, 1, node); > } > > int create_irq(void) > @@ -3054,6 +3080,27 @@ void destroy_irq(unsigned int irq) > free_irq_at(irq, cfg); > } > > +static inline void destroy_irqs(unsigned int irq, unsigned int count) > +{ > + unsigned int i; > + for (i = 0; i < count; i++) Missing newline. 
> + destroy_irq(irq + i); > +} > + > +static inline int > +can_create_pow_of_two_irqs(unsigned int from, unsigned int count) > +{ > + if ((count > 1) && (count % 2)) > + return -EINVAL; > + > + for (; count; count = count / 2) { > + if (!irq_can_alloc_irqs(from, count)) > + return count; > + } > + > + return -ENOSPC; > +} > + > /* > * MSI message composition > */ > @@ -3145,18 +3192,25 @@ static struct irq_chip msi_chip = { > .irq_retrigger = ioapic_retrigger_irq, > }; > > -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) > +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, > + unsigned int irq_base, unsigned int irq_offset) > { > struct irq_chip *chip = &msi_chip; > struct msi_msg msg; > + unsigned int irq = irq_base + irq_offset; > int ret; > > ret = msi_compose_msg(dev, irq, &msg, -1); > if (ret < 0) > return ret; > > - irq_set_msi_desc(irq, msidesc); > - write_msi_msg(irq, &msg); > + irq_set_msi_desc_off(irq_base, irq_offset, msidesc); > + > + /* MSI-X message is written per-IRQ, the offset is always 0. > + * MSI message denotes a contiguous group of IRQs, written for 0th IRQ. > + */ Please use the customary (multi-line) comment style: /* * Comment ..... * ...... goes here. */ specified in Documentation/CodingStyle. 
> + if (!irq_offset) > + write_msi_msg(irq, &msg); > > if (irq_remapped(irq_get_chip_data(irq))) { > irq_set_status_flags(irq, IRQ_MOVE_PCNTXT); > @@ -3170,16 +3224,12 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) > return 0; > } > > -int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) > +int setup_msix_irqs(struct pci_dev *dev, int nvec) > { > int node, ret, sub_handle, index = 0; > unsigned int irq, irq_want; > struct msi_desc *msidesc; > > - /* x86 doesn't support multiple MSI yet */ > - if (type == PCI_CAP_ID_MSI && nvec > 1) > - return 1; > - > node = dev_to_node(&dev->dev); > irq_want = nr_irqs_gsi; > sub_handle = 0; > @@ -3208,7 +3258,7 @@ int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) > goto error; > } > no_ir: > - ret = setup_msi_irq(dev, msidesc, irq); > + ret = setup_msi_irq(dev, msidesc, irq, 0); > if (ret < 0) > goto error; > sub_handle++; > @@ -3220,6 +3270,76 @@ error: > return ret; > } > > +int setup_msi_irqs(struct pci_dev *dev, int nvec) > +{ > + int node, ret, sub_handle, index = 0; > + unsigned int irq; > + struct msi_desc *msidesc; > + > + if (nvec > 1 && !irq_remapping_enabled) > + return 1; > + > + nvec = __roundup_pow_of_two(nvec); > + ret = can_create_pow_of_two_irqs(nr_irqs_gsi, nvec); > + if (ret != nvec) > + return ret; > + > + WARN_ON(!list_is_singular(&dev->msi_list)); > + msidesc = list_entry(dev->msi_list.next, struct msi_desc, list); > + WARN_ON(msidesc->irq); > + WARN_ON(msidesc->msi_attrib.multiple); > + > + node = dev_to_node(&dev->dev); > + irq = __create_irqs(nr_irqs_gsi, nvec, node); > + if (irq == 0) > + return -ENOSPC; > + > + if (!irq_remapping_enabled) { > + ret = setup_msi_irq(dev, msidesc, irq, 0); > + if (ret < 0) > + goto error; > + return 0; > + } > + > + msidesc->msi_attrib.multiple = ilog2(nvec); > + for (sub_handle = 0; sub_handle < nvec; sub_handle++) { > + if (!sub_handle) { > + index = msi_alloc_remapped_irq(dev, irq, nvec); > + if (index < 
0) { > + ret = index; > + goto error; > + } > + } else { > + ret = msi_setup_remapped_irq(dev, irq + sub_handle, > + index, sub_handle); > + if (ret < 0) > + goto error; > + } > + ret = setup_msi_irq(dev, msidesc, irq, sub_handle); > + if (ret < 0) > + goto error; > + } > + return 0; > + > +error: > + destroy_irqs(irq, nvec); > + > + /* Restore altered MSI descriptor fields and prevent just destroyed > + * IRQs from tearing down again in default_teardown_msi_irqs() > + */ Ditto. > + msidesc->irq = 0; > + msidesc->msi_attrib.multiple = 0; > + > + return ret; > +} > + > +int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) > +{ > + if (type == PCI_CAP_ID_MSI) > + return setup_msi_irqs(dev, nvec); > + return setup_msix_irqs(dev, nvec); > +} > + > void native_teardown_msi_irq(unsigned int irq) > { > destroy_irq(irq); > diff --git a/include/linux/irq.h b/include/linux/irq.h > index 216b0ba..c3ba39f 100644 > --- a/include/linux/irq.h > +++ b/include/linux/irq.h > @@ -522,6 +522,8 @@ extern int irq_set_handler_data(unsigned int irq, void *data); > extern int irq_set_chip_data(unsigned int irq, void *data); > extern int irq_set_irq_type(unsigned int irq, unsigned int type); > extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); > +extern int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, > + struct msi_desc *entry); > extern struct irq_data *irq_get_irq_data(unsigned int irq); > > static inline struct irq_chip *irq_get_chip(unsigned int irq) > @@ -584,8 +586,12 @@ int __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node, > #define irq_alloc_desc_from(from, node) \ > irq_alloc_descs(-1, from, 1, node) > > +#define irq_alloc_descs_from(from, cnt, node) \ > + irq_alloc_descs(-1, from, cnt, node) > + Please use inlines instead of macros. Might transform the one above it as well in the process. 
> void irq_free_descs(unsigned int irq, unsigned int cnt); > int irq_reserve_irqs(unsigned int from, unsigned int cnt); > +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt); > > static inline void irq_free_desc(unsigned int irq) > { > diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c > index 57d86d0..2230389 100644 > --- a/kernel/irq/chip.c > +++ b/kernel/irq/chip.c > @@ -90,27 +90,41 @@ int irq_set_handler_data(unsigned int irq, void *data) > EXPORT_SYMBOL(irq_set_handler_data); > > /** > - * irq_set_msi_desc - set MSI descriptor data for an irq > - * @irq: Interrupt number > - * @entry: Pointer to MSI descriptor data > + * irq_set_msi_desc_off - set MSI descriptor data for an irq at offset > + * @irq_base: Interrupt number base > + * @irq_offset: Interrupt number offset > + * @entry: Pointer to MSI descriptor data > * > - * Set the MSI descriptor entry for an irq > + * Set the MSI descriptor entry for an irq at offset > */ > -int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) > +int irq_set_msi_desc_off(unsigned int irq_base, unsigned int irq_offset, > + struct msi_desc *entry) > { > unsigned long flags; > - struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL); > + struct irq_desc *desc = irq_get_desc_lock(irq_base + irq_offset, &flags, IRQ_GET_DESC_CHECK_GLOBAL); > > if (!desc) > return -EINVAL; > desc->irq_data.msi_desc = entry; > - if (entry) > - entry->irq = irq; > + if (entry && !irq_offset) > + entry->irq = irq_base; > irq_put_desc_unlock(desc, flags); > return 0; > } > > /** > + * irq_set_msi_desc - set MSI descriptor data for an irq > + * @irq: Interrupt number > + * @entry: Pointer to MSI descriptor data > + * > + * Set the MSI descriptor entry for an irq > + */ > +int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) > +{ > + return irq_set_msi_desc_off(irq, 0, entry); > +} > + > +/** > * irq_set_chip_data - set irq chip data for an irq > * @irq: Interrupt number > * @data: Pointer to chip 
specific data > diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c > index 192a302..8287b78 100644 > --- a/kernel/irq/irqdesc.c > +++ b/kernel/irq/irqdesc.c > @@ -210,6 +210,13 @@ static int irq_expand_nr_irqs(unsigned int nr) > return 0; > } > > +static int irq_can_expand_nr_irqs(unsigned int nr) > +{ > + if (nr > IRQ_BITMAP_BITS) > + return -ENOMEM; > + return 0; > +} > + > int __init early_irq_init(void) > { > int i, initcnt, node = first_online_node; > @@ -414,6 +421,30 @@ int irq_reserve_irqs(unsigned int from, unsigned int cnt) > } > > /** > + * irq_can_alloc_irqs - checks if a range of irqs could be allocated > + * @from: check from irq number > + * @cnt: number of irqs to check > + * > + * Returns 0 on success or an appropriate error code > + */ > +int irq_can_alloc_irqs(unsigned int from, unsigned int cnt) > +{ > + unsigned int start; > + int ret = 0; > + > + if (!cnt) > + return -EINVAL; > + > + mutex_lock(&sparse_irq_lock); > + start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS, > + from, cnt, 0); > + mutex_unlock(&sparse_irq_lock); > + if (start + cnt > nr_irqs) > + ret = irq_can_expand_nr_irqs(start + cnt); > + return ret; How is this supposed to work wrt. races? sparse_irq_lock is dropped before the function returns, so by the time the caller (setup_msi_irqs(), via can_create_pow_of_two_irqs()) goes on to call __create_irqs() the answer may already be stale. Thanks, Ingo -- To unsubscribe from this list: send the line "unsubscribe linux-pci" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html