[RFC PATCH v3 3/3] genirq/cpuhotplug: Adjust managed irqs according to change of housekeeping CPU

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Interrupts disturb real-time tasks on affined cpus.
To ensure CPU isolation for real-time tasks, interrupt handling must
be adjusted accordingly.
Non-managed interrupts can be configured from userspace,
while managed interrupts require adjustments in kernelspace.

Adjust status of managed interrupts according change
of housekeeping CPUs to support dynamic CPU isolation.

Signed-off-by: Costa Shulyupin <costa.shul@xxxxxxxxxx>

---

The following code is a proof of concept to validate
and review the correctness of the approach to solving the problem.

C++ comments denote temporary comments.

v3:
- rename `int managed_irq_affinity_adjust()`
  to `void managed_irq_adjust_activity()`
- Addresses Thomas Gleixner's comments:
- add locking to managed_irq_adjust_activity()
- add blk_mq_flush_on_cpu() to flush queues associated
  with isolated interrupts.
v2:
- refactor irq_affinity_adjust():
- add more comments
- add managed_irq_isolate() derived from migrate_one_irq(),
  irq_needs_fixup() and irq_fixup_move_pending()
- use irq_set_affinity() instead of irq_set_affinity_locked
- Addressed Gleixner's comments:
- use `struct cpumask *` instead of `cpumask_var_t` in function signature
- remove locking in irq_affinity_adjust()

v1:
- https://lore.kernel.org/lkml/20240516190437.3545310-5-costa.shul@xxxxxxxxxx/
---
 block/blk-mq.c           | 19 ++++++++
 include/linux/blk-mq.h   |  2 +
 include/linux/cpu.h      |  4 ++
 include/linux/irq.h      |  2 +
 kernel/cgroup/cpuset.c   |  1 +
 kernel/cpu.c             |  2 +-
 kernel/irq/cpuhotplug.c  | 99 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/isolation.c | 14 ++++--
 8 files changed, 139 insertions(+), 4 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2909c51a13bd..484b6488739b 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3756,6 +3756,25 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
 	return 0;
 }
 
+/*
+ * Draft function to flush data in
+ * block multi-queues to perform isolation
+ * of specified cpu from managed interrupts.
+ */
+void blk_mq_flush_on_cpu(int cpu)
+{
+	// TODO: Thoroughly test this code with high test coverage.
+	/* Calling:
+	 * - blk_mq_hctx_notify_offline()
+	 * - blk_mq_hctx_notify_dead()
+	 * - bio_cpu_dead()
+	 */
+	cpuhp_invoke_callback(cpu, CPUHP_AP_BLK_MQ_ONLINE, false, NULL, NULL);
+	cpuhp_invoke_callback(cpu, CPUHP_BLK_MQ_DEAD, false, NULL, NULL);
+	cpuhp_invoke_callback(cpu, CPUHP_BIO_DEAD, false, NULL, NULL);
+	blk_softirq_cpu_dead(cpu);
+}
+
 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
 {
 	if (!(hctx->flags & BLK_MQ_F_STACKING))
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2035fad3131f..286cd2a0ba84 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -929,6 +929,8 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
 unsigned int blk_mq_rq_cpu(struct request *rq);
 
+void blk_mq_flush_on_cpu(int cpu);
+
 bool __blk_should_fake_timeout(struct request_queue *q);
 static inline bool blk_should_fake_timeout(struct request_queue *q)
 {
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index bdcec1732445..f4504b3c129a 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -203,4 +203,8 @@ static inline bool cpu_mitigations_auto_nosmt(void)
 }
 #endif
 
+int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+				 bool bringup, struct hlist_node *node,
+				 struct hlist_node **lastp);
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/include/linux/irq.h b/include/linux/irq.h
index d73bbb7797d0..3974f95c1783 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -619,6 +619,8 @@ extern int irq_affinity_online_cpu(unsigned int cpu);
 # define irq_affinity_online_cpu	NULL
 #endif
 
+void managed_irq_adjust_activity(struct cpumask *enable_mask);
+
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ)
 void __irq_move_irq(struct irq_data *data);
 static inline void irq_move_irq(struct irq_data *data)
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 5066397899c9..8364369976e4 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -95,6 +95,7 @@ static struct list_head remote_children;
 #define HOUSEKEEPING_FLAGS	(BIT(HK_TYPE_TIMER)  | BIT(HK_TYPE_RCU)  |\
 				 BIT(HK_TYPE_SCHED)  | BIT(HK_TYPE_MISC) |\
 				 BIT(HK_TYPE_DOMAIN) | BIT(HK_TYPE_WQ)   |\
+				 BIT(HK_TYPE_MANAGED_IRQ) |\
 				 BIT(HK_TYPE_KTHREAD))
 
 /*
diff --git a/kernel/cpu.c b/kernel/cpu.c
index afc920116d42..44c7da0e1b8d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -171,7 +171,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
  *
  * Return: %0 on success or a negative errno code
  */
-static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
+int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
 				 bool bringup, struct hlist_node *node,
 				 struct hlist_node **lastp)
 {
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index b92cf6a76351..9233694d541a 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -279,3 +279,102 @@ int irq_affinity_online_cpu(unsigned int cpu)
 
 	return 0;
 }
+
+/*
+ * managed_irq_isolate() - Deactivate managed interrupts if necessary
+ */
+// derived from migrate_one_irq(), irq_needs_fixup(), irq_fixup_move_pending().
+// Finally, this function can be considered for
+// merging back with migrate_one_irq().
+static int managed_irq_isolate(struct irq_desc *desc)
+{
+	struct irq_data *d = irq_desc_get_irq_data(desc);
+	struct irq_chip *chip = irq_data_get_irq_chip(d);
+	const struct cpumask *a;
+	bool maskchip;
+	int err;
+
+	/*
+	 * Deactivate if:
+	 * - Interrupt is managed
+	 * - Interrupt is not per cpu
+	 * - Interrupt is started
+	 * - Effective affinity mask includes isolated CPUs
+	 */
+	if (!irqd_affinity_is_managed(d) || irqd_is_per_cpu(d) || !irqd_is_started(d)
+	    || cpumask_subset(irq_data_get_effective_affinity_mask(d),
+			      housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
+		return 0;
+	// TBD: it is required?
+	/*
+	 * Complete an eventually pending irq move cleanup. If this
+	 * interrupt was moved in hard irq context, then the vectors need
+	 * to be cleaned up. It can't wait until this interrupt actually
+	 * happens and this CPU was involved.
+	 */
+	irq_force_complete_move(desc);
+
+	if (irqd_is_setaffinity_pending(d)) {
+		irqd_clr_move_pending(d);
+		if (cpumask_intersects(desc->pending_mask,
+		    housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
+			a = irq_desc_get_pending_mask(desc);
+	} else {
+		a = irq_data_get_affinity_mask(d);
+	}
+
+	maskchip = chip->irq_mask && !irq_can_move_pcntxt(d) && !irqd_irq_masked(d);
+	if (maskchip)
+		chip->irq_mask(d);
+
+	if (!cpumask_intersects(a, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ))) {
+		/*
+		 * Shut managed interrupt down and leave the affinity untouched.
+		 * The effective affinity is reset to the first online CPU.
+		 */
+		irqd_set_managed_shutdown(d);
+		irq_shutdown_and_deactivate(desc);
+		return 0;
+	}
+
+	/*
+	 * Do not set the force argument of irq_do_set_affinity() as this
+	 * disables the masking of offline CPUs from the supplied affinity
+	 * mask and therefore might keep/reassign the irq to the isolated
+	 * CPU.
+	 */
+	err = irq_do_set_affinity(d, a, false);
+	if (err)
+		pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
+				    d->irq, err);
+
+	if (maskchip)
+		chip->irq_unmask(d);
+
+	return err;
+}
+
+/** managed_irq_adjust_activity() - Deactivate of restore managed interrupts
+ * according to change of housekeeping cpumask.
+ *
+ * @enable_mask:	CPUs for which interrupts should be restored
+ */
+void managed_irq_adjust_activity(struct cpumask *enable_mask)
+{
+	unsigned int irq;
+
+	for_each_active_irq(irq) {
+		struct irq_desc *desc = irq_to_desc(irq);
+		unsigned long flags;
+		unsigned int cpu;
+
+		if (!desc)
+			continue;
+
+		raw_spin_lock_irqsave(&desc->lock, flags);
+		for_each_cpu(cpu, enable_mask)
+			irq_restore_affinity_of_irq(desc, cpu);
+		managed_irq_isolate(desc);
+		raw_spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index ebbb215505e8..d1a0f1b104da 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,6 +8,8 @@
  *
  */
 
+#include <linux/blk-mq.h>
+
 #undef pr_fmt
 #define pr_fmt(fmt)    "%s:%d: %s " fmt, __FILE__, __LINE__, __func__
 
@@ -152,10 +154,16 @@ static int housekeeping_update(enum hk_type type, const struct cpumask *update)
 	if (!static_branch_unlikely(&housekeeping_overridden))
 		static_key_enable_cpuslocked(&housekeeping_overridden.key);
 
-	/* Add here code to update dependent subsystems with
-	 * changes of the housekeeping masks.
-	 */
+	switch (type) {
+	case HK_TYPE_MANAGED_IRQ:
+		int cpu;
 
+		for_each_cpu(cpu, &masks->disable)
+			blk_mq_flush_on_cpu(cpu);
+		managed_irq_adjust_activity(&masks->enable);
+		break;
+	default:
+	}
 	kfree(masks);
 
 	return 0;
-- 
2.47.0





[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux