[PATCH hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU assignments within a device

Haiyang Zhang <haiyangz@xxxxxxxxxxxxx> · Thu, 15 Jul 2021 09:10:42 -0700

The vmbus module uses a rotational algorithm to assign target CPUs to
device's channels. Depends on the timing of different device's channel
offers, different channels of a device may be assigned to the same CPU.

For example on a VM with 2 CPUs, if the NIC A and B's channels offered
in the following order, the NIC A will have both channels on CPU0, and
NIC B will have both channels on CPU1 -- see below. This kind of
assignments cause RSS spreading loads among different channels ends up
on the same CPU.

Timing of channel offers:
NIC A channel 0
NIC B channel 0
NIC A channel 1
NIC B channel 1

VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
        Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
        Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
        Rel_ID=14, target_cpu=0
        Rel_ID=17, target_cpu=0

VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
        Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
        Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
        Rel_ID=16, target_cpu=1
        Rel_ID=18, target_cpu=1


Update the vmbus' CPU assignment algorithm to avoid duplicate CPU
assignments within a device.

The new algorithm iterates 2 * #NUMA_Node + 1 times. In the first
round of checking all NUMA nodes, it tries to find previously unassigned
CPUs by this and other devices. If not available, it clears the
allocated CPU mask.
In the second round, it tries to find unassigned CPUs by the same
device.
In the last iteration, it assigns the channel to the first available CPU.
This is not normally expected, because during device probe, we limit the
number of channels of a device to be <= number of online CPUs.

Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>

---
 drivers/hv/channel_mgmt.c | 95 ++++++++++++++++++++++++++-------------
 1 file changed, 65 insertions(+), 30 deletions(-)

diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
index caf6d0c4bc1b..fbddc4954f57 100644
--- a/drivers/hv/channel_mgmt.c
+++ b/drivers/hv/channel_mgmt.c
@@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	 */
 	mutex_lock(&vmbus_connection.channel_mutex);
 
+	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
+		if (guid_equal(&channel->offermsg.offer.if_type,
+			       &newchannel->offermsg.offer.if_type) &&
+		    guid_equal(&channel->offermsg.offer.if_instance,
+			       &newchannel->offermsg.offer.if_instance)) {
+			fnew = false;
+			newchannel->primary_channel = channel;
+			break;
+		}
+	}
+
 	init_vp_index(newchannel);
 
 	/* Remember the channels that should be cleaned up upon suspend. */
@@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	 */
 	atomic_dec(&vmbus_connection.offer_in_progress);
 
-	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
-		if (guid_equal(&channel->offermsg.offer.if_type,
-			       &newchannel->offermsg.offer.if_type) &&
-		    guid_equal(&channel->offermsg.offer.if_instance,
-			       &newchannel->offermsg.offer.if_instance)) {
-			fnew = false;
-			break;
-		}
-	}
-
 	if (fnew) {
 		list_add_tail(&newchannel->listentry,
 			      &vmbus_connection.chn_list);
@@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 		/*
 		 * Process the sub-channel.
 		 */
-		newchannel->primary_channel = channel;
 		list_add_tail(&newchannel->sc_list, &channel->sc_list);
 	}
 
@@ -683,6 +683,29 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
 	queue_work(wq, &newchannel->add_channel_work);
 }
 
+/*
+ * Clear CPUs used by other channels of the same device.
+ * It's should only be called by init_vp_index().
+ */
+static bool hv_clear_usedcpu(struct cpumask *cmask, struct vmbus_channel *chn)
+{
+	struct vmbus_channel *primary = chn->primary_channel;
+	struct vmbus_channel *sc;
+
+	lockdep_assert_held(&vmbus_connection.channel_mutex);
+
+	if (!primary)
+		return !cpumask_empty(cmask);
+
+	cpumask_clear_cpu(primary->target_cpu, cmask);
+
+	list_for_each_entry(sc, &primary->sc_list, sc_list)
+		if (sc != chn)
+			cpumask_clear_cpu(sc->target_cpu, cmask);
+
+	return !cpumask_empty(cmask);
+}
+
 /*
  * We use this state to statically distribute the channel interrupt load.
  */
@@ -705,7 +728,7 @@ static void init_vp_index(struct vmbus_channel *channel)
 	cpumask_var_t available_mask;
 	struct cpumask *alloced_mask;
 	u32 target_cpu;
-	int numa_node;
+	int numa_node, i;
 
 	if ((vmbus_proto_version == VERSION_WS2008) ||
 	    (vmbus_proto_version == VERSION_WIN7) || (!perf_chn) ||
@@ -724,29 +747,41 @@ static void init_vp_index(struct vmbus_channel *channel)
 		return;
 	}
 
-	while (true) {
-		numa_node = next_numa_node_id++;
-		if (numa_node == nr_node_ids) {
-			next_numa_node_id = 0;
-			continue;
+	for (i = 1; i <= nr_node_ids * 2 + 1; i++) {
+		while (true) {
+			numa_node = next_numa_node_id++;
+			if (numa_node == nr_node_ids) {
+				next_numa_node_id = 0;
+				continue;
+			}
+			if (cpumask_empty(cpumask_of_node(numa_node)))
+				continue;
+			break;
 		}
-		if (cpumask_empty(cpumask_of_node(numa_node)))
-			continue;
-		break;
-	}
-	alloced_mask = &hv_context.hv_numa_map[numa_node];
+		alloced_mask = &hv_context.hv_numa_map[numa_node];
+
+		if (cpumask_weight(alloced_mask) ==
+		    cpumask_weight(cpumask_of_node(numa_node))) {
+			/*
+			 * We have cycled through all the CPUs in the node;
+			 * reset the alloced map.
+			 */
+			cpumask_clear(alloced_mask);
+		}
+
+		cpumask_xor(available_mask, alloced_mask,
+			    cpumask_of_node(numa_node));
+
+		/* Try to avoid duplicate cpus within a device */
+		if (channel->offermsg.offer.sub_channel_index >=
+		    num_online_cpus() ||
+		    i > nr_node_ids * 2 ||
+		    hv_clear_usedcpu(available_mask, channel))
+			break;
 
-	if (cpumask_weight(alloced_mask) ==
-	    cpumask_weight(cpumask_of_node(numa_node))) {
-		/*
-		 * We have cycled through all the CPUs in the node;
-		 * reset the alloced map.
-		 */
 		cpumask_clear(alloced_mask);
 	}
 
-	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
-
 	target_cpu = cpumask_first(available_mask);
 	cpumask_set_cpu(target_cpu, alloced_mask);
 
-- 
2.25.1