[PATCH] x86-64: Support for multiple MSIs

Matthew Wilcox <matthew@xxxxxx> · Thu, 10 Jul 2008 21:00:02 -0400

Implement the arch_setup_msi_irqs() interface.  Extend create_irq()
into create_irq_block() and reimplement create_irq as a wrapper around
it.  Create assign_irq_vector_block() based closely on
assign_irq_vector().  Teach set_msi_irq_affinity() how to handle
multiple MSIs.

Signed-off-by: Matthew Wilcox <willy@xxxxxxxxxxxxxxx>
---
 arch/x86/kernel/io_apic_64.c |  237 ++++++++++++++++++++++++++++++++++++------
 1 files changed, 205 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c
index ef1a8df..6a00dca 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic_64.c
@@ -61,7 +61,7 @@ struct irq_cfg {
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
-struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
+static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
 	[0]  = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR,  },
 	[1]  = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR,  },
 	[2]  = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR,  },
@@ -683,6 +683,8 @@ static int pin_2_irq(int idx, int apic, int pin)
 	return irq;
 }
 
+static int current_vector = FIRST_DEVICE_VECTOR;
+
 static int __assign_irq_vector(int irq, cpumask_t mask)
 {
 	/*
@@ -696,7 +698,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
 	 * Also, we've got to be careful not to trash gate
 	 * 0x80, because int 0x80 is hm, kind of importantish. ;)
 	 */
-	static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0;
+	static int current_offset = 0;
 	unsigned int old_vector;
 	int cpu;
 	struct irq_cfg *cfg;
@@ -769,6 +771,97 @@ static int assign_irq_vector(int irq, cpumask_t mask)
 	return err;
 }
 
+static int __assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+	unsigned int old_vector;
+	int i, cpu;
+	struct irq_cfg *cfg;
+
+	/*
+	 * We've got to be careful not to trash gate 0x80,
+	 * because int 0x80 is hm, kind of importantish. ;)
+	 */
+	BUG_ON((unsigned)irq + count > NR_IRQS);
+
+	/* Only try and allocate irqs on cpus that are present */
+	cpus_and(mask, mask, cpu_online_map);
+
+	for (i = 0; i < count; i++) {
+		cfg = &irq_cfg[irq + i];
+		if ((cfg->move_in_progress) || cfg->move_cleanup_count)
+			return -EBUSY;
+	}
+
+	cfg = &irq_cfg[irq];
+	old_vector = cfg->vector;
+	if (old_vector) {
+		cpumask_t tmp;
+		cpus_and(tmp, cfg->domain, mask);
+		if (!cpus_empty(tmp))
+			return 0;
+	}
+
+	for_each_cpu_mask(cpu, mask) {
+		cpumask_t domain, new_mask;
+		int new_cpu;
+		int vector;
+
+		domain = vector_allocation_domain(cpu);
+		cpus_and(new_mask, domain, cpu_online_map);
+
+		vector = current_vector & ~(count - 1);
+ next:
+		vector += count;
+		if (vector + count >= FIRST_SYSTEM_VECTOR) {
+			vector = FIRST_DEVICE_VECTOR & ~(count - 1);
+			if (vector < FIRST_DEVICE_VECTOR)
+				vector += count;
+		}
+		if (unlikely(vector == (current_vector & ~(count - 1))))
+			continue;
+		if ((IA32_SYSCALL_VECTOR >= vector) &&
+		    (IA32_SYSCALL_VECTOR < vector + count))
+			goto next;
+		for_each_cpu_mask(new_cpu, new_mask) {
+			for (i = 0; i < count; i++) {
+				if (per_cpu(vector_irq, new_cpu)[vector + i]
+									!= -1)
+					goto next;
+			}
+		}
+		/* Found one! */
+		current_vector = vector + count - 1;
+		for (i = 0; i < count; i++) {
+			cfg = &irq_cfg[irq + i];
+			if (old_vector) {
+				cfg->move_in_progress = 1;
+				cfg->old_domain = cfg->domain;
+			}
+			for_each_cpu_mask(new_cpu, new_mask) {
+				per_cpu(vector_irq, new_cpu)[vector + i] =
+					irq + i;
+			}
+			cfg->vector = vector;
+			cfg->domain = domain;
+		}
+		return 0;
+	}
+	return -ENOSPC;
+}
+
+/* Assumes that count is a power of two and aligns to that power of two */
+static int assign_irq_vector_block(int irq, int count, cpumask_t mask)
+{
+	int result;
+	unsigned long flags;
+
+	spin_lock_irqsave(&vector_lock, flags);
+	result = __assign_irq_vector_block(irq, count, mask);
+	spin_unlock_irqrestore(&vector_lock, flags);
+
+	return result;
+}
+
 static void __clear_irq_vector(int irq)
 {
 	struct irq_cfg *cfg;
@@ -788,6 +881,14 @@ static void __clear_irq_vector(int irq)
 	cpus_clear(cfg->domain);
 }
 
+static void __clear_irq_vector_block(int irq, int count)
+{
+	while (count > 0) {
+		count--;
+		__clear_irq_vector(irq + count);
+	}
+}
+
 void __setup_vector_irq(int cpu)
 {
 	/* Initialize vector_irq on a new cpu */
@@ -1895,30 +1996,56 @@ device_initcall(ioapic_init_sysfs);
 /*
  * Dynamic irq allocate and deallocation
  */
-int create_irq(void)
+
+/*
+ * On success, returns the interrupt number of the lowest numbered irq
+ * in the block.  If it can't find a block of the right size, it returns
+ * -1 - (length of the longest run).
+ */
+static int create_irq_block(int count)
 {
-	/* Allocate an unused irq */
-	int irq;
-	int new;
+	/* Allocate 'count' consecutive unused irqs */
+	int i, new, longest;
 	unsigned long flags;
 
-	irq = -ENOSPC;
+	longest = 0;
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = (NR_IRQS - 1); new >= 0; new--) {
 		if (platform_legacy_irq(new))
-			continue;
+			goto clear;
 		if (irq_cfg[new].vector != 0)
+			goto clear;
+		longest++;
+		if (longest < count)
 			continue;
-		if (__assign_irq_vector(new, TARGET_CPUS) == 0)
-			irq = new;
+
+		while (__assign_irq_vector_block(new, longest, TARGET_CPUS))
+			longest /= 2;
+		if (longest < count)
+			__clear_irq_vector_block(new, longest);
 		break;
+ clear:
+		__clear_irq_vector_block(new + 1, longest);
+		longest = 0;
 	}
 	spin_unlock_irqrestore(&vector_lock, flags);
 
-	if (irq >= 0) {
-		dynamic_irq_init(irq);
+	if (longest < count)
+		return -1 - longest;
+
+	for (i = 0; i < count; i++) {
+		dynamic_irq_init(new + i);
 	}
-	return irq;
+
+	return new;
+}
+
+int create_irq(void)
+{
+	int ret = create_irq_block(1);
+	if (ret < 0)
+		return -ENOSPC;
+	return ret;
 }
 
 void destroy_irq(unsigned int irq)
@@ -1936,7 +2063,8 @@ void destroy_irq(unsigned int irq)
  * MSI message composition
  */
 #ifdef CONFIG_PCI_MSI
-static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
+static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
+				unsigned int count, struct msi_msg *msg)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
 	int err;
@@ -1944,7 +2072,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 	cpumask_t tmp;
 
 	tmp = TARGET_CPUS;
-	err = assign_irq_vector(irq, tmp);
+	if (count == 1)
+		err = assign_irq_vector(irq, tmp);
+	else
+		err = assign_irq_vector_block(irq, count, tmp);
 	if (!err) {
 		cpus_and(tmp, cfg->domain, tmp);
 		dest = cpu_mask_to_apicid(tmp);
@@ -1975,6 +2106,8 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
+	struct msi_desc *desc = get_irq_msi(irq);
+	int i, count = 1 << desc->msi_attrib.multiple;
 	struct msi_msg msg;
 	unsigned int dest;
 	cpumask_t tmp;
@@ -1983,8 +2116,15 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	if (cpus_empty(tmp))
 		return;
 
-	if (assign_irq_vector(irq, mask))
-		return;
+	if (count > 1) {
+		/* Multiple MSIs all go to the same destination */
+		irq = desc->irq;
+		if (assign_irq_vector_block(irq, count, mask))
+			return;
+	} else {
+		if (assign_irq_vector(irq, mask))
+			return;
+	}
 
 	cpus_and(tmp, cfg->domain, mask);
 	dest = cpu_mask_to_apicid(tmp);
@@ -1997,7 +2137,9 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg(irq, &msg);
-	irq_desc[irq].affinity = mask;
+
+	for (i = 0; i < count; i++)
+		irq_desc[irq + i].affinity = mask;
 }
 #endif /* CONFIG_SMP */
 
@@ -2016,28 +2158,59 @@ static struct irq_chip msi_chip = {
 	.retrigger	= ioapic_retrigger_irq,
 };
 
-int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
+static int x86_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *desc, int count)
 {
 	struct msi_msg msg;
-	int irq, ret;
-	irq = create_irq();
-	if (irq < 0)
-		return irq;
-
-	ret = msi_compose_msg(dev, irq, &msg);
-	if (ret < 0) {
-		destroy_irq(irq);
-		return ret;
+	int i, ret, base_irq, alloc;
+
+	/* MSI can only allocate a power-of-two */
+	alloc = roundup_pow_of_two(count);
+
+	base_irq = create_irq_block(alloc);
+	if (base_irq < 0) {
+		if (alloc == 1)
+			return -ENOSPC;
+		return rounddown_pow_of_two(-base_irq - 1);
 	}
 
-	set_irq_msi(irq, desc);
-	write_msi_msg(irq, &msg);
+	ret = msi_compose_msg(pdev, base_irq, alloc, &msg);
+	if (ret)
+		return ret;
+
+	desc->msi_attrib.multiple = order_base_2(alloc);
 
-	set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
+	/* Do loop in reverse so set_irq_msi ends up setting
+	 * desc->irq to base_irq
+	 */
+	for (i = count - 1; i >= 0; i--) {
+		set_irq_msi(base_irq + i, desc);
+		set_irq_chip_and_handler_name(base_irq + i, &msi_chip,
+						handle_edge_irq, "edge");
+	}
+	write_msi_msg(base_irq, &msg);
 
 	return 0;
 }
 
+int arch_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type)
+{
+	struct msi_desc *desc;
+	int ret;
+
+	if (type == PCI_CAP_ID_MSI) {
+		desc = list_first_entry(&pdev->msi_list, struct msi_desc, list);
+		ret = x86_setup_msi_irq(pdev, desc, nvec);
+	} else {
+		list_for_each_entry(desc, &pdev->msi_list, list) {
+			ret = x86_setup_msi_irq(pdev, desc, 1);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
 void arch_teardown_msi_irq(unsigned int irq)
 {
 	destroy_irq(irq);
@@ -2090,7 +2263,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 	int ret;
 	struct msi_msg msg;
 
-	ret = msi_compose_msg(NULL, irq, &msg);
+	ret = msi_compose_msg(NULL, irq, 1, &msg);
 	if (ret < 0)
 		return ret;
 	dmar_msi_write(irq, &msg);
-- 
1.5.5.4

--
To unsubscribe from this list: send the line "unsubscribe linux-ide" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html