From: LKML haiyangz <lkmlhyz@xxxxxxxxxxxxx> On Behalf Of Haiyang Zhang
Sent: Thursday, July 15, 2021 6:38 PM
>
> The vmbus module uses a rotational algorithm to assign target CPUs to
> device's channels. Depends on the timing of different device's channel

s/device's/a device's/
s/Depends/Depending/

> offers, different channels of a device may be assigned to the same CPU.
>
> For example on a VM with 2 CPUs, if NIC A and B's channels are offered
> in the following order, NIC A will have both channels on CPU0, and
> NIC B will have both channels on CPU1 -- see below. This kind of
> assignment causes RSS load that is spreading across different channels
> to end up on the same CPU.
>
> Timing of channel offers:
> NIC A channel 0
> NIC B channel 0
> NIC A channel 1
> NIC B channel 1
>
> VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> 	Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
> 	Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
> 	Rel_ID=14, target_cpu=0
> 	Rel_ID=17, target_cpu=0
>
> VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> 	Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
> 	Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
> 	Rel_ID=16, target_cpu=1
> 	Rel_ID=18, target_cpu=1
>
> Update the vmbus CPU assignment algorithm to avoid duplicate CPU
> assignments within a device.
>
> The new algorithm iterates num_online_cpus + 1 times.
> The existing rotational algorithm to find "next NUMA & CPU" is still here.
> But if the resulting CPU is already used by the same device, it will try
> the next CPU.
> In the last iteration, it assigns the channel to the next available CPU
> like the existing algorithm. This is not normally expected, because
> during device probe, we limit the number of channels of a device to
> be <= number of online CPUs.
>
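The retry behaviour described above can be modelled with a rough,
self-contained userspace sketch. The helpers pick_next_cpu() and
device_uses_cpu() below are simplified, hypothetical stand-ins for the
kernel's NUMA-aware rotation and for hv_cpuself_used(); this is an
illustration of the idea, not the actual implementation.

/*
 * Toy model of the revised assignment loop.  A flat CPU counter
 * replaces the kernel's per-NUMA-node cpumask bookkeeping.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPU	2		/* online CPUs, as in the example above */
#define MAX_CH	8

static unsigned int next_cpu;	/* global rotational cursor */

/* Existing rotation: hand out CPUs round-robin across the whole VM. */
static unsigned int pick_next_cpu(void)
{
	unsigned int cpu = next_cpu;

	next_cpu = (next_cpu + 1) % NCPU;
	return cpu;
}

/* Stand-in for hv_cpuself_used(): is @cpu already taken by this device? */
static bool device_uses_cpu(const int dev[], int nch, unsigned int cpu)
{
	int i;

	for (i = 0; i < nch; i++)
		if (dev[i] == (int)cpu)
			return true;
	return false;
}

/* New behaviour: retry up to NCPU times, then accept whatever comes next. */
static unsigned int assign_channel(const int dev[], int nch)
{
	unsigned int cpu = 0;
	unsigned int i;

	for (i = 1; i <= NCPU + 1; i++) {
		cpu = pick_next_cpu();
		if (i > NCPU || !device_uses_cpu(dev, nch, cpu))
			break;	/* CPU unused by this device, or last try */
	}
	return cpu;
}

int main(void)
{
	int nic_a[MAX_CH], nic_b[MAX_CH];
	int na = 0, nb = 0;

	/* Offer order from the commit message: A0, B0, A1, B1. */
	nic_a[na] = assign_channel(nic_a, na); na++;
	nic_b[nb] = assign_channel(nic_b, nb); nb++;
	nic_a[na] = assign_channel(nic_a, na); na++;
	nic_b[nb] = assign_channel(nic_b, nb); nb++;

	printf("NIC A channels on CPUs %d and %d\n", nic_a[0], nic_a[1]);
	printf("NIC B channels on CPUs %d and %d\n", nic_b[0], nic_b[1]);
	return 0;
}

With the offer order above on 2 CPUs, the sketch spreads each NIC across
CPUs 0 and 1, whereas dropping the device_uses_cpu() check reproduces the
0/0 and 1/1 duplication shown in the device listing above.
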
>
> Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
> ---
>  drivers/hv/channel_mgmt.c | 96 ++++++++++++++++++++++++++-------------
>  1 file changed, 64 insertions(+), 32 deletions(-)
>
> diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> index caf6d0c4bc1b..8584914291e7 100644
> --- a/drivers/hv/channel_mgmt.c
> +++ b/drivers/hv/channel_mgmt.c
> @@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  	 */
>  	mutex_lock(&vmbus_connection.channel_mutex);
>
> +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> +		if (guid_equal(&channel->offermsg.offer.if_type,
> +			       &newchannel->offermsg.offer.if_type) &&
> +		    guid_equal(&channel->offermsg.offer.if_instance,
> +			       &newchannel->offermsg.offer.if_instance)) {
> +			fnew = false;
> +			newchannel->primary_channel = channel;
> +			break;
> +		}
> +	}
> +
>  	init_vp_index(newchannel);
>
>  	/* Remember the channels that should be cleaned up upon suspend. */
> @@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  	 */
>  	atomic_dec(&vmbus_connection.offer_in_progress);
>
> -	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> -		if (guid_equal(&channel->offermsg.offer.if_type,
> -			       &newchannel->offermsg.offer.if_type) &&
> -		    guid_equal(&channel->offermsg.offer.if_instance,
> -			       &newchannel->offermsg.offer.if_instance)) {
> -			fnew = false;
> -			break;
> -		}
> -	}
> -
>  	if (fnew) {
>  		list_add_tail(&newchannel->listentry,
>  			      &vmbus_connection.chn_list);
> @@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  		/*
>  		 * Process the sub-channel.
>  		 */
> -		newchannel->primary_channel = channel;
>  		list_add_tail(&newchannel->sc_list, &channel->sc_list);
>  	}
>
> @@ -683,6 +683,30 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
>  	queue_work(wq, &newchannel->add_channel_work);
>  }
>
> +/*
> + * Check if CPUs used by other channels of the same device.
> + * It's should only be called by init_vp_index().

s/It's/It/

> + */
> +static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn)
> +{
> +	struct vmbus_channel *primary = chn->primary_channel;
> +	struct vmbus_channel *sc;
> +
> +	lockdep_assert_held(&vmbus_connection.channel_mutex);
> +
> +	if (!primary)
> +		return false;
> +
> +	if (primary->target_cpu == cpu)
> +		return true;
> +
> +	list_for_each_entry(sc, &primary->sc_list, sc_list)
> +		if (sc != chn && sc->target_cpu == cpu)
> +			return true;
> +
> +	return false;
> +}
> +
>  /*
>   * We use this state to statically distribute the channel interrupt load.
>   */
> @@ -702,6 +726,7 @@ static int next_numa_node_id;
>  static void init_vp_index(struct vmbus_channel *channel)
>  {
>  	bool perf_chn = hv_is_perf_channel(channel);
> +	u32 i, ncpu = num_online_cpus();
>  	cpumask_var_t available_mask;
>  	struct cpumask *alloced_mask;
>  	u32 target_cpu;
> @@ -724,31 +749,38 @@ static void init_vp_index(struct vmbus_channel *channel)
>  		return;
>  	}
>
> -	while (true) {
> -		numa_node = next_numa_node_id++;
> -		if (numa_node == nr_node_ids) {
> -			next_numa_node_id = 0;
> -			continue;
> +	for (i = 1; i <= ncpu + 1; i++) {
> +		while (true) {
> +			numa_node = next_numa_node_id++;
> +			if (numa_node == nr_node_ids) {
> +				next_numa_node_id = 0;
> +				continue;
> +			}
> +			if (cpumask_empty(cpumask_of_node(numa_node)))
> +				continue;
> +			break;
> +		}
> +		alloced_mask = &hv_context.hv_numa_map[numa_node];
> +
> +		if (cpumask_weight(alloced_mask) ==
> +		    cpumask_weight(cpumask_of_node(numa_node))) {
> +			/*
> +			 * We have cycled through all the CPUs in the node;
> +			 * reset the alloced map.
> +			 */
> +			cpumask_clear(alloced_mask);
>  		}
> -		if (cpumask_empty(cpumask_of_node(numa_node)))
> -			continue;
> -		break;
> -	}
> -	alloced_mask = &hv_context.hv_numa_map[numa_node];
>
> -	if (cpumask_weight(alloced_mask) ==
> -	    cpumask_weight(cpumask_of_node(numa_node))) {
> -		/*
> -		 * We have cycled through all the CPUs in the node;
> -		 * reset the alloced map.
> -		 */
> -		cpumask_clear(alloced_mask);
> -	}
> +		cpumask_xor(available_mask, alloced_mask,
> +			    cpumask_of_node(numa_node));
>
> -	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
> +		target_cpu = cpumask_first(available_mask);
> +		cpumask_set_cpu(target_cpu, alloced_mask);
>
> -	target_cpu = cpumask_first(available_mask);
> -	cpumask_set_cpu(target_cpu, alloced_mask);
> +		if (channel->offermsg.offer.sub_channel_index >= ncpu ||
> +		    i > ncpu || !hv_cpuself_used(target_cpu, channel))
> +			break;
> +	}
>
>  	channel->target_cpu = target_cpu;
>
> --
> 2.25.1

I like this approach much better.  I did some testing with a variety of
CPU counts (1, 2, 3, 4, 8, and 32), NUMA nodes (1, 3, and 4) and with
4 SCSI controllers and 4 NICs.  Everything worked as expected, and it's
definitely an improvement.

I flagged a couple of typos/wording errors, but maybe they can be fixed
when the patch is pulled into hyperv-fixes.

Reviewed-by: Michael Kelley <mikelley@xxxxxxxxxxxxx>
Tested-by: Michael Kelley <mikelley@xxxxxxxxxxxxx>