[RFC 10/12] Drivers: hv: vmbus: Implement vmbus_irq_set_affinity

mhkelley58@xxxxxxxxx · Mon, 3 Jun 2024 22:09:38 -0700

From: Michael Kelley <mhklinux@xxxxxxxxxxx>

Pull out core code from target_cpu_store() to implement
vmbus_irq_set_affinity() so the affinity of VMBus channel interrupts
can be updated from user space via /proc/irq.

Since vmbus_irq_set_affinity() runs with interrupts disabled,
vmbus_send_modifychannel() can't wait for an ACK from Hyper-V. As
such, remove the "wait for ack" version of vmbus_send_modifychannel().
Not waiting isn't a problem unless the old CPU is quickly taken offline
before Hyper-V makes the change, which is dealt with in a subsequent
patch.

Also change target_cpu_store() to call irq_set_affinity() so that
changes made via /sys/bus/vmbus/devices/<guid>/channels/<nn>/cpu
are in sync with the /proc/irq interface. The cpus_read_lock() is
no longer needed in target_cpu_store() because irq_set_affinity()
ensures that the interrupt affinity is not set to an offline
CPU.

Signed-off-by: Michael Kelley <mhklinux@xxxxxxxxxxx>
---
 drivers/hv/channel.c   |  97 ++++++-------------------
 drivers/hv/vmbus_drv.c | 161 +++++++++++++++++++++++++----------------
 2 files changed, 121 insertions(+), 137 deletions(-)

diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c
index 1aa020b538f1..b7920072e243 100644
--- a/drivers/hv/channel.c
+++ b/drivers/hv/channel.c
@@ -212,79 +212,6 @@ int vmbus_send_tl_connect_request(const guid_t *shv_guest_servie_id,
 }
 EXPORT_SYMBOL_GPL(vmbus_send_tl_connect_request);
 
-static int send_modifychannel_without_ack(struct vmbus_channel *channel, u32 target_vp)
-{
-	struct vmbus_channel_modifychannel msg;
-	int ret;
-
-	memset(&msg, 0, sizeof(msg));
-	msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
-	msg.child_relid = channel->offermsg.child_relid;
-	msg.target_vp = target_vp;
-
-	ret = vmbus_post_msg(&msg, sizeof(msg), true);
-	trace_vmbus_send_modifychannel(&msg, ret);
-
-	return ret;
-}
-
-static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target_vp)
-{
-	struct vmbus_channel_modifychannel *msg;
-	struct vmbus_channel_msginfo *info;
-	unsigned long flags;
-	int ret;
-
-	info = kzalloc(sizeof(struct vmbus_channel_msginfo) +
-				sizeof(struct vmbus_channel_modifychannel),
-		       GFP_KERNEL);
-	if (!info)
-		return -ENOMEM;
-
-	init_completion(&info->waitevent);
-	info->waiting_channel = channel;
-
-	msg = (struct vmbus_channel_modifychannel *)info->msg;
-	msg->header.msgtype = CHANNELMSG_MODIFYCHANNEL;
-	msg->child_relid = channel->offermsg.child_relid;
-	msg->target_vp = target_vp;
-
-	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
-	list_add_tail(&info->msglistentry, &vmbus_connection.chn_msg_list);
-	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-
-	ret = vmbus_post_msg(msg, sizeof(*msg), true);
-	trace_vmbus_send_modifychannel(msg, ret);
-	if (ret != 0) {
-		spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
-		list_del(&info->msglistentry);
-		spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-		goto free_info;
-	}
-
-	/*
-	 * Release channel_mutex; otherwise, vmbus_onoffer_rescind() could block on
-	 * the mutex and be unable to signal the completion.
-	 *
-	 * See the caller target_cpu_store() for information about the usage of the
-	 * mutex.
-	 */
-	mutex_unlock(&vmbus_connection.channel_mutex);
-	wait_for_completion(&info->waitevent);
-	mutex_lock(&vmbus_connection.channel_mutex);
-
-	spin_lock_irqsave(&vmbus_connection.channelmsg_lock, flags);
-	list_del(&info->msglistentry);
-	spin_unlock_irqrestore(&vmbus_connection.channelmsg_lock, flags);
-
-	if (info->response.modify_response.status)
-		ret = -EAGAIN;
-
-free_info:
-	kfree(info);
-	return ret;
-}
-
 /*
  * Set/change the vCPU (@target_vp) the channel (@child_relid) will interrupt.
  *
@@ -294,14 +221,32 @@ static int send_modifychannel_with_ack(struct vmbus_channel *channel, u32 target
  * out an ACK, we can not know when the host will stop interrupting the "old"
  * vCPU and start interrupting the "new" vCPU for the given channel.
  *
+ * But even if Hyper-V provides the ACK, we don't wait for it because the
+ * caller, vmbus_irq_set_affinity(), is running with a spin lock held. The
+ * unknown delay in when the host will start interrupting the new vCPU is not
+ * a problem unless the old vCPU is taken offline, and that situation is dealt
+ * with separately in the CPU offlining path.
+ *
  * The CHANNELMSG_MODIFYCHANNEL message type is supported since VMBus version
  * VERSION_WIN10_V4_1.
  */
 int vmbus_send_modifychannel(struct vmbus_channel *channel, u32 target_vp)
 {
-	if (vmbus_proto_version >= VERSION_WIN10_V5_3)
-		return send_modifychannel_with_ack(channel, target_vp);
-	return send_modifychannel_without_ack(channel, target_vp);
+	struct vmbus_channel_modifychannel msg;
+	int ret;
+
+	if (vmbus_proto_version < VERSION_WIN10_V4_1)
+		return -EINVAL;
+
+	memset(&msg, 0, sizeof(msg));
+	msg.header.msgtype = CHANNELMSG_MODIFYCHANNEL;
+	msg.child_relid = channel->offermsg.child_relid;
+	msg.target_vp = target_vp;
+
+	ret = vmbus_post_msg(&msg, sizeof(msg), false);
+	trace_vmbus_send_modifychannel(&msg, ret);
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(vmbus_send_modifychannel);
 
diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c
index b73be7c02d37..87f2f3436136 100644
--- a/drivers/hv/vmbus_drv.c
+++ b/drivers/hv/vmbus_drv.c
@@ -22,7 +22,6 @@
 #include <linux/kernel_stat.h>
 #include <linux/of_address.h>
 #include <linux/clockchips.h>
-#include <linux/cpu.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/task_stack.h>
 
@@ -1322,10 +1321,107 @@ static irqreturn_t vmbus_percpu_isr(int irq, void *dev_id)
 	return IRQ_NONE;
 }
 
+/*
+ * This function is invoked by user space affinity changes initiated
+ * from /proc/irq/<nn> or from the legacy VMBus-specific interface at
+ * /sys/bus/vmbus/devices/<guid>/channels/<nn>/cpu.
+ *
+ * In the former case, the /proc implementation ensures that unmapping
+ * (i.e., deleting) the IRQ will pend while this function is in progress.
+ * Since deleting the channel unmaps the IRQ first, the channel can't go
+ * away either.
+ *
+ * In the latter case, the VMBus connection channel_mutex is held, which
+ * prevents channel deltion, and therefore IRQ unampping as well.
+ *
+ * So in both cases, accessing the channel and IRQ data structures is safe.
+ */
 int vmbus_irq_set_affinity(struct irq_data *data,
 				  const struct cpumask *dest, bool force)
 {
-	return 0;
+	static int next_cpu;
+	static cpumask_t tempmask;
+	int origin_cpu, target_cpu;
+	struct vmbus_channel *channel = irq_data_get_irq_handler_data(data);
+	int ret;
+
+	if (!channel) {
+		pr_err("Bad channel in vmbus_irq_set_affinity for relid %ld\n",
+				data->hwirq);
+		return -EINVAL;
+	}
+
+	/* Don't consider CPUs that are isolated */
+	if (housekeeping_enabled(HK_TYPE_MANAGED_IRQ))
+		cpumask_and(&tempmask, dest,
+				housekeeping_cpumask(HK_TYPE_MANAGED_IRQ));
+	else
+		cpumask_copy(&tempmask, dest);
+
+	/*
+	 * If Hyper-V is already targeting a CPU in the new affinity mask,
+	 * keep that targeting and Hyper-V doesn't need to be updated. But
+	 * still set effective affinity as it may be unset when the IRQ is
+	 * first created.
+	 */
+	origin_cpu = channel->target_cpu;
+	if (cpumask_test_cpu(origin_cpu, &tempmask)) {
+		target_cpu = origin_cpu;
+		goto update_effective;
+	}
+
+	/*
+	 * Pick a CPU from the new affinity mask. As a simple heuristic to
+	 * spread out the selection when the mask contains multiple CPUs,
+	 * start with whatever CPU was last selected.
+	 */
+	target_cpu = cpumask_next_wrap(next_cpu, &tempmask, nr_cpu_ids, false);
+	if (target_cpu >= nr_cpu_ids)
+		return -EINVAL;
+	next_cpu = target_cpu;
+
+	/*
+	 * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
+	 * avoid sending the message and fail here for such channels.
+	 */
+	if (channel->state != CHANNEL_OPENED_STATE)
+		return -EIO;
+
+	ret = vmbus_send_modifychannel(channel,
+				     hv_cpu_number_to_vp_number(target_cpu));
+	if (ret)
+		return ret;
+
+	/*
+	 * Warning.  At this point, there is *no* guarantee that the host will
+	 * have successfully processed the vmbus_send_modifychannel() request.
+	 * See the header comment of vmbus_send_modifychannel() for more info.
+	 *
+	 * Lags in the processing of the above vmbus_send_modifychannel() can
+	 * result in missed interrupts if the "old" target CPU is taken offline
+	 * before Hyper-V starts sending interrupts to the "new" target CPU.
+	 * hv_synic_cleanup() will ensure no interrupts are missed.
+	 *
+	 * But apart from this offlining scenario, the code tolerates such
+	 * lags.  It will function correctly even if a channel interrupt comes
+	 * in on a CPU that is different from the channel target_cpu value.
+	 */
+
+	channel->target_cpu = target_cpu;
+
+	/* See init_vp_index(). */
+	if (hv_is_perf_channel(channel))
+		hv_update_allocated_cpus(origin_cpu, target_cpu);
+
+	/* Currently set only for storvsc channels. */
+	if (channel->change_target_cpu_callback) {
+		(*channel->change_target_cpu_callback)(channel,
+				origin_cpu, target_cpu);
+	}
+
+update_effective:
+	irq_data_update_effective_affinity(data, cpumask_of(target_cpu));
+	return IRQ_SET_MASK_OK;
 }
 
 /*
@@ -1655,7 +1751,7 @@ static ssize_t target_cpu_show(struct vmbus_channel *channel, char *buf)
 static ssize_t target_cpu_store(struct vmbus_channel *channel,
 				const char *buf, size_t count)
 {
-	u32 target_cpu, origin_cpu;
+	u32 target_cpu;
 	ssize_t ret = count;
 
 	if (vmbus_proto_version < VERSION_WIN10_V4_1)
@@ -1668,17 +1764,6 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
 	if (target_cpu >= nr_cpumask_bits)
 		return -EINVAL;
 
-	if (!cpumask_test_cpu(target_cpu, housekeeping_cpumask(HK_TYPE_MANAGED_IRQ)))
-		return -EINVAL;
-
-	/* No CPUs should come up or down during this. */
-	cpus_read_lock();
-
-	if (!cpu_online(target_cpu)) {
-		cpus_read_unlock();
-		return -EINVAL;
-	}
-
 	/*
 	 * Synchronizes target_cpu_store() and channel closure:
 	 *
@@ -1703,55 +1788,9 @@ static ssize_t target_cpu_store(struct vmbus_channel *channel,
 	 */
 	mutex_lock(&vmbus_connection.channel_mutex);
 
-	/*
-	 * Hyper-V will ignore MODIFYCHANNEL messages for "non-open" channels;
-	 * avoid sending the message and fail here for such channels.
-	 */
-	if (channel->state != CHANNEL_OPENED_STATE) {
-		ret = -EIO;
-		goto cpu_store_unlock;
-	}
-
-	origin_cpu = channel->target_cpu;
-	if (target_cpu == origin_cpu)
-		goto cpu_store_unlock;
-
-	if (vmbus_send_modifychannel(channel,
-				     hv_cpu_number_to_vp_number(target_cpu))) {
-		ret = -EIO;
-		goto cpu_store_unlock;
-	}
-
-	/*
-	 * For version before VERSION_WIN10_V5_3, the following warning holds:
-	 *
-	 * Warning.  At this point, there is *no* guarantee that the host will
-	 * have successfully processed the vmbus_send_modifychannel() request.
-	 * See the header comment of vmbus_send_modifychannel() for more info.
-	 *
-	 * Lags in the processing of the above vmbus_send_modifychannel() can
-	 * result in missed interrupts if the "old" target CPU is taken offline
-	 * before Hyper-V starts sending interrupts to the "new" target CPU.
-	 * But apart from this offlining scenario, the code tolerates such
-	 * lags.  It will function correctly even if a channel interrupt comes
-	 * in on a CPU that is different from the channel target_cpu value.
-	 */
-
-	channel->target_cpu = target_cpu;
-
-	/* See init_vp_index(). */
-	if (hv_is_perf_channel(channel))
-		hv_update_allocated_cpus(origin_cpu, target_cpu);
-
-	/* Currently set only for storvsc channels. */
-	if (channel->change_target_cpu_callback) {
-		(*channel->change_target_cpu_callback)(channel,
-				origin_cpu, target_cpu);
-	}
+	ret = irq_set_affinity(channel->irq, cpumask_of(target_cpu));
 
-cpu_store_unlock:
 	mutex_unlock(&vmbus_connection.channel_mutex);
-	cpus_read_unlock();
 	return ret;
 }
 static VMBUS_CHAN_ATTR(cpu, 0644, target_cpu_show, target_cpu_store);
-- 
2.25.1