Implement the arch_setup_msi_irqs() interface.  Extend create_irq() into
create_irq_block() and reimplement create_irq as a wrapper around it.
Create assign_irq_vector_block() based closely on assign_irq_vector().
Teach set_msi_irq_affinity() how to handle multiple MSIs.

Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
---
Review fixes folded in (hunk offsets and diffstat updated to match; the
index line below is unchanged from the original posting):
 - create_irq_block(): stop halving the request at zero.  A zero-length
   request always fails with -ENOSPC, so the retry loop never terminated
   (with vector_lock held) once no vector could be assigned.
 - x86_setup_msi_irq(): destroy the allocated irq block when
   msi_compose_msg() fails, as the code being replaced did.
 - x86_setup_msi_irq(): return -ENOSPC when create_irq_block() reports a
   run of length zero instead of calling rounddown_pow_of_two(0).
 - arch_setup_msi_irqs(): initialise ret.
 - __assign_irq_vector_block(): the mask is restricted to online cpus,
   not present ones; comment corrected.
 - Open question, code left as posted: every irq of a block stores the
   base vector in cfg->vector while vector_irq[] maps vector + i.

 arch/x86/kernel/io_apic_64.c |  261 +++++++++++++++++++++++++++++++++++++-----
 1 files changed, 229 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8df..6a00dca 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
 	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
 	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
 	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
@@ -683,6 +683,8 @@ static int pin_2_irq(int idx, int apic, int pin)
 	return irq;
 }
 
+static int current_vector = FIRST_DEVICE_VECTOR;
+
 static int __assign_irq_vector(int irq, cpumask_t mask)
 {
 	/*
@@ -696,7 +698,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
 	struct irq_cfg *cfg;
@@ -769,6 +771,104 @@ static int assign_irq_vector(int irq, cpumask_t mask)
 	return err;
 }
 
+/*
+ * Find 'count' consecutive free vectors, aligned to 'count', for irqs
+ * irq .. irq + count - 1.  'count' must be a power of two.  Callers hold
+ * vector_lock.  Returns 0 on success, -EBUSY if any irq in the block is
+ * still being moved, or -ENOSPC if no suitable run of vectors was found.
+ */
+static int __assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+	unsigned int old_vector;
+	int i, cpu;
+	struct irq_cfg *cfg;
+
+	/*
+	 * We've got to be careful not to trash gate 0x80,
+	 * because int 0x80 is hm, kind of importantish. ;)
+	 */
+	BUG_ON((unsigned)irq + count > NR_IRQS);
+
+	/* Only try to allocate irqs on cpus that are online */
+	cpus_and(mask, mask, cpu_online_map);
+
+	for (i = 0; i < count; i++) {
+		cfg = &irq_cfg[irq + i];
+		if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+			return -EBUSY;
+	}
+
+	cfg = &irq_cfg[irq];
+	old_vector = cfg->vector;
+	if (old_vector) {
+		cpumask_t tmp;
+		cpus_and(tmp, cfg->domain, mask);
+		if (!cpus_empty(tmp))
+			return 0;
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		cpumask_t domain, new_mask;
+		int new_cpu;
+		int vector;
+
+		domain = vector_allocation_domain(cpu);
+		cpus_and(new_mask, domain, cpu_online_map);
+
+		vector = current_vector & ~(count - 1);
+ next:
+		vector += count;
+		if (vector + count >= FIRST_SYSTEM_VECTOR) {
+			vector = FIRST_DEVICE_VECTOR & ~(count - 1);
+			if (vector < FIRST_DEVICE_VECTOR)
+				vector += count;
+		}
+		if (unlikely(vector == (current_vector & ~(count - 1))))
+			continue;
+		if ((IA32_SYSCALL_VECTOR >= vector) &&
+		    (IA32_SYSCALL_VECTOR < vector + count))
+			goto next;
+		for_each_cpu_mask(new_cpu, new_mask) {
+			for (i = 0; i < count; i++) {
+				if (per_cpu(vector_irq, new_cpu)[vector + i]
+								!= -1)
+					goto next;
+			}
+		}
+		/* Found one! */
+		current_vector = vector + count - 1;
+		for (i = 0; i < count; i++) {
+			cfg = &irq_cfg[irq + i];
+			if (old_vector) {
+				cfg->move_in_progress = 1;
+				cfg->old_domain = cfg->domain;
+			}
+			for_each_cpu_mask(new_cpu, new_mask) {
+				per_cpu(vector_irq, new_cpu)[vector + i] =
+								irq + i;
+			}
+			/* NOTE(review): not vector + i -- confirm intended */
+			cfg->vector = vector;
+			cfg->domain = domain;
+		}
+		return 0;
+	}
+	return -ENOSPC;
+}
+
+/* Assumes that count is a power of two and aligns to that power of two */
+static int assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+	int result;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	result = __assign_irq_vector_block(irq, count, mask);
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	return result;
+}
+
 static void __clear_irq_vector(int irq)
 {
 	struct irq_cfg *cfg;
@@ -788,6 +888,15 @@ static void __clear_irq_vector(int irq)
 	cpus_clear(cfg->domain);
 }
 
+/* Clear irqs irq .. irq + count - 1, highest first; count 0 is a no-op */
+static void __clear_irq_vector_block(int irq, int count)
+{
+	while (count > 0) {
+		count--;
+		__clear_irq_vector(irq + count);
+	}
+}
+
 void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
@@ -1895,30 +2004,62 @@ device_initcall(ioapic_init_sysfs);
 /*
  * Dynamic irq allocate and deallocation
  */
-int create_irq(void)
+
+/*
+ * On success, returns the interrupt number of the lowest numbered irq
+ * in the block.  If it can't find a block of the right size, it returns
+ * -1 - (length of the longest run).
+ */
+static int create_irq_block(int count)
 {
-	/* Allocate an unused irq */
-	int irq;
-	int new;
+	/* Allocate 'count' consecutive unused irqs */
+	int i, new, longest;
 	unsigned long flags;
 
-	irq = -ENOSPC;
+	longest = 0;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = (NR_IRQS - 1); new >= 0; new--) {
 		if (platform_legacy_irq(new))
-			continue;
+			goto clear;
 		if (irq_cfg[new].vector != 0)
+			goto clear;
+		longest++;
+		if (longest < count)
 			continue;
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
-			irq = new;
+
+		/*
+		 * Halve the request until it fits.  Stop at zero: a
+		 * zero-length block can never be assigned, so retrying
+		 * it would spin forever with vector_lock held.
+		 */
+		while (longest > 0 &&
+		       __assign_irq_vector_block(new, longest, TARGET_CPUS))
+			longest /= 2;
+		if (longest < count)
+			__clear_irq_vector_block(new, longest);
 		break;
+ clear:
+		__clear_irq_vector_block(new + 1, longest);
+		longest = 0;
 	}
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (irq >= 0) {
-		dynamic_irq_init(irq);
+	if (longest < count)
+		return -1 - longest;
+
+	for (i = 0; i < count; i++) {
+		dynamic_irq_init(new + i);
 	}
-	return irq;
+
+	return new;
+}
+
+int create_irq(void)
+{
+	int ret = create_irq_block(1);
+	if (ret < 0)
+		return -ENOSPC;
+	return ret;
 }
 
 void destroy_irq(unsigned int irq)
@@ -1936,7 +2077,8 @@ void destroy_irq(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+					unsigned int count, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
 	int err;
@@ -1944,7 +2086,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 	cpumask_t tmp;
 
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	if (count == 1)
+		err = assign_irq_vector(irq, tmp);
+	else
+		err = assign_irq_vector_block(irq, count, tmp);
 	if (!err) {
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
@@ -1975,6 +2120,8 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
+	struct msi_desc *desc = get_irq_msi(irq);
+	int i, count = 1 << desc->msi_attrib.multiple;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
@@ -1983,8 +2130,15 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
-		return;
+	if (count > 1) {
+		/* Multiple MSIs all go to the same destination */
+		irq = desc->irq;
+		if (assign_irq_vector_block(irq, count, mask))
+			return;
+	} else {
+		if (assign_irq_vector(irq, mask))
+			return;
+	}
 
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
@@ -1997,7 +2151,9 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = mask;
+
+	for (i = 0; i < count; i++)
+		irq_desc[irq + i].affinity = mask;
 }
 #endif /* CONFIG_SMP */
 
@@ -2016,28 +2172,69 @@ static struct irq_chip msi_chip = {
 	.retrigger = ioapic_retrigger_irq,
 };
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+/*
+ * Set up 'count' MSIs for one descriptor.  Returns 0 on success, a
+ * negative errno on failure, or (when a multi-irq block does not fit) a
+ * positive power of two derived from the run create_irq_block() reported.
+ */
+static int x86_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc, int count)
 {
 	struct msi_msg msg;
-	int irq, ret;
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
-
-	ret = msi_compose_msg(dev, irq, &msg);
-	if (ret < 0) {
-		destroy_irq(irq);
-		return ret;
+	int i, ret, base_irq, alloc;
+
+	/* MSI can only allocate a power-of-two */
+	alloc = roundup_pow_of_two(count);
+
+	base_irq = create_irq_block(alloc);
+	if (base_irq < 0) {
+		/* -1 is a run of zero; rounddown_pow_of_two(0) is undefined */
+		if (alloc == 1 || base_irq == -1)
+			return -ENOSPC;
+		return rounddown_pow_of_two(-base_irq - 1);
 	}
 
-	set_irq_msi(irq, desc);
-	write_msi_msg(irq, &msg);
+	ret = msi_compose_msg(pdev, base_irq, alloc, &msg);
+	if (ret) {
+		/* Give back the whole block rather than leaking it */
+		for (i = 0; i < alloc; i++)
+			destroy_irq(base_irq + i);
+		return ret;
+	}
+
+	desc->msi_attrib.multiple = order_base_2(alloc);
 
-	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+	/* Do loop in reverse so set_irq_msi ends up setting
+	 * desc->irq to base_irq
+	 */
+	for (i = count - 1; i >= 0; i--) {
+		set_irq_msi(base_irq + i, desc);
+		set_irq_chip_and_handler_name(base_irq + i, &msi_chip,
+						handle_edge_irq, "edge");
+	}
+	write_msi_msg(base_irq, &msg);
 
 	return 0;
 }
 
+int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct msi_desc *desc;
+	int ret = 0;
+
+	if (type == PCI_CAP_ID_MSI) {
+		desc = list_first_entry(&pdev->msi_list, struct msi_desc, list);
+		ret = x86_setup_msi_irq(pdev, desc, nvec);
+	} else {
+		list_for_each_entry(desc, &pdev->msi_list, list) {
+			ret = x86_setup_msi_irq(pdev, desc, 1);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
 void arch_teardown_msi_irq(unsigned int irq)
 {
 	destroy_irq(irq);
@@ -2090,7 +2287,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 	int ret;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
+	ret = msi_compose_msg(NULL, irq, 1, &msg);
 	if (ret < 0)
 		return ret;
 	dmar_msi_write(irq, &msg);
-- 
1.5.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html