Hi,
On 31. 10. 24, 8:46, 'Guanjun' wrote:
From: Guanjun <guanjun@xxxxxxxxxxxxxxxxx>
Commit c410abbbacb9 (genirq/affinity: Add is_managed to struct irq_affinity_desc)
introduced the is_managed bit to struct irq_affinity_desc. Because queue interrupts
are treated as managed interrupts, in scenarios where a large number of
devices are present (using massive numbers of MSI-X queue interrupts), an excessive
number of IRQ matrix bits (about num_online_cpus() * nvecs) are reserved during
interrupt allocation. This subsequently leads to a situation where interrupts
for some devices cannot be properly allocated.
Add support for limiting the number of managed interrupts on each node per allocation.
Signed-off-by: Guanjun <guanjun@xxxxxxxxxxxxxxxxx>
---
.../admin-guide/kernel-parameters.txt | 9 +++
block/blk-mq-cpumap.c | 2 +-
drivers/virtio/virtio_vdpa.c | 2 +-
fs/fuse/virtio_fs.c | 2 +-
include/linux/group_cpus.h | 2 +-
kernel/irq/affinity.c | 11 ++--
lib/group_cpus.c | 55 ++++++++++++++++++-
7 files changed, 73 insertions(+), 10 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9b61097a6448..ac80f35d04c9 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3238,6 +3238,15 @@
different yeeloong laptops.
Example: machtype=lemote-yeeloong-2f-7inch
+ managed_irqs_per_node=
+ [KNL,SMP] Limit the number of managed
+ interrupts on every node, to prevent interrupts
+ from failing to be allocated when a large
+ number of devices are present. The default value is 0,
+ which means there is no limit on the number of managed IRQs.
+ Format: integer between 0 and num_possible_cpus() / num_possible_nodes()
+ Default: 0
Kernel parameters suck. Esp. here you have to guess to even properly
boot. Could this be auto-tuned instead?
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -11,6 +11,30 @@
#ifdef CONFIG_SMP
+static unsigned int __read_mostly managed_irqs_per_node;
+static struct cpumask managed_irqs_cpumsk[MAX_NUMNODES] __cacheline_aligned_in_smp = {
This is quite excessive. On SUSE configs, this is 8192 cpu bits * 1024
nodes = 1 M. For everyone. You have to allocate this dynamically
instead. See e.g. setup_node_to_cpumask_map().
+ [0 ... MAX_NUMNODES-1] = {CPU_BITS_ALL}
+};
+
+static int __init irq_managed_setup(char *str)
+{
+ int ret;
+
+ ret = kstrtouint(str, 10, &managed_irqs_per_node);
+ if (ret < 0) {
+ pr_warn("managed_irqs_per_node= cannot parse, ignored\n");
could not be parsed
+ return 0;
+ }
+
+ if (managed_irqs_per_node * num_possible_nodes() > num_possible_cpus()) {
+ managed_irqs_per_node = num_possible_cpus() / num_possible_nodes();
+ pr_warn("managed_irqs_per_node= cannot be larger than %u\n",
+ managed_irqs_per_node);
+ }
+ return 1;
+}
+__setup("managed_irqs_per_node=", irq_managed_setup);
+
static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
unsigned int cpus_per_grp)
{
...
@@ -332,6 +380,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
/**
* group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
* @numgrps: number of groups
+ * @is_managed: if these groups managed by kernel
are managed by the kernel
*
* Return: cpumask array if successful, NULL otherwise. And each element
* includes CPUs assigned to this group
thanks,
--
js
suse labs