> -----Original Message-----
> From: Michael Kelley <mikelley@xxxxxxxxxxxxx>
> Sent: Friday, July 16, 2021 1:45 PM
> To: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>; linux-hyperv@xxxxxxxxxxxxxxx
> Cc: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>; KY Srinivasan <kys@xxxxxxxxxxxxx>; linux-kernel@xxxxxxxxxxxxxxx
> Subject: RE: [PATCH v2,hyperv-fixes] Drivers: hv: vmbus: Fix duplicate CPU assignments within a device
>
> From: LKML haiyangz <lkmlhyz@xxxxxxxxxxxxx> On Behalf Of Haiyang Zhang
> Sent: Thursday, July 15, 2021 6:38 PM
> >
> > The vmbus module uses a rotational algorithm to assign target CPUs to
> > device's channels. Depends on the timing of different device's channel
>
> s/device's/a device's/
> s/Depends/Depending/
>
> > offers, different channels of a device may be assigned to the same CPU.
> >
> > For example on a VM with 2 CPUs, if NIC A and B's channels are offered
> > in the following order, NIC A will have both channels on CPU0, and NIC
> > B will have both channels on CPU1 -- see below. This kind of
> > assignment causes RSS load that is spreading across different channels
> > to end up on the same CPU.
> >
> > Timing of channel offers:
> > NIC A channel 0
> > NIC B channel 0
> > NIC A channel 1
> > NIC B channel 1
> >
> > VMBUS ID 14: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> >       Device_ID = {cab064cd-1f31-47d5-a8b4-9d57e320cccd}
> >       Sysfs path: /sys/bus/vmbus/devices/cab064cd-1f31-47d5-a8b4-9d57e320cccd
> >       Rel_ID=14, target_cpu=0
> >       Rel_ID=17, target_cpu=0
> >
> > VMBUS ID 16: Class_ID = {f8615163-df3e-46c5-913f-f2d2f965ed0e} - Synthetic network adapter
> >       Device_ID = {244225ca-743e-4020-a17d-d7baa13d6cea}
> >       Sysfs path: /sys/bus/vmbus/devices/244225ca-743e-4020-a17d-d7baa13d6cea
> >       Rel_ID=16, target_cpu=1
> >       Rel_ID=18, target_cpu=1
> >
> > Update the vmbus CPU assignment algorithm to avoid duplicate CPU
> > assignments within a device.
> >
> > The new algorithm iterates num_online_cpus + 1 times.
> > The existing rotational algorithm to find "next NUMA & CPU" is still here.
> > But if the resulting CPU is already used by the same device, it will
> > try the next CPU.
> > In the last iteration, it assigns the channel to the next available
> > CPU like the existing algorithm. This is not normally expected,
> > because during device probe, we limit the number of channels of a
> > device to be <= number of online CPUs.
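
To make the retry loop described above concrete, here is a small stand-alone
user-space model of it. This is an illustration only, not the patch code:
pick_next_cpu() and cpu_used_by_device() are made-up stand-ins for the
kernel's rotational assignment and for hv_cpuself_used(), and the patch's
additional sub_channel_index >= ncpu shortcut is omitted.

/*
 * Model of the "num_online_cpus + 1 passes" assignment loop.
 * All names below are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

#define NCPU 2                          /* models num_online_cpus() */

static int next_cpu;                    /* models the rotational state */

/* Hypothetical stand-in for the existing "next NUMA & CPU" rotation. */
static int pick_next_cpu(void)
{
        int cpu = next_cpu;

        next_cpu = (next_cpu + 1) % NCPU;
        return cpu;
}

/* Models hv_cpuself_used(): is cpu already taken by this device? */
static bool cpu_used_by_device(int cpu, const int *dev_cpus, int nchan)
{
        for (int i = 0; i < nchan; i++)
                if (dev_cpus[i] == cpu)
                        return true;
        return false;
}

int main(void)
{
        int nic_a[2], nic_b[2];
        /* Offer order from the example above: A0, B0, A1, B1 */
        int *slot[4] = { &nic_a[0], &nic_b[0], &nic_a[1], &nic_b[1] };
        int *dev[4]  = { nic_a, nic_b, nic_a, nic_b };
        int nchan[4] = { 0, 0, 1, 1 }; /* channels already assigned */

        for (int o = 0; o < 4; o++) {
                int cpu = 0;

                /* NCPU + 1 passes; the last pass accepts any CPU. */
                for (int i = 1; i <= NCPU + 1; i++) {
                        cpu = pick_next_cpu();
                        if (i > NCPU ||
                            !cpu_used_by_device(cpu, dev[o], nchan[o]))
                                break;
                }
                *slot[o] = cpu;
        }

        printf("NIC A: cpu%d cpu%d\n", nic_a[0], nic_a[1]);
        printf("NIC B: cpu%d cpu%d\n", nic_b[0], nic_b[1]);
        return 0;
}

Run with the 2-CPU offer order from the example, this model assigns NIC A to
cpu0/cpu1 and NIC B to cpu1/cpu0, so neither device ends up with two channels
on the same CPU.
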
> >
> > Signed-off-by: Haiyang Zhang <haiyangz@xxxxxxxxxxxxx>
> > ---
> >  drivers/hv/channel_mgmt.c | 96 ++++++++++++++++++++++++++-------------
> >  1 file changed, 64 insertions(+), 32 deletions(-)
> >
> > diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c
> > index caf6d0c4bc1b..8584914291e7 100644
> > --- a/drivers/hv/channel_mgmt.c
> > +++ b/drivers/hv/channel_mgmt.c
> > @@ -605,6 +605,17 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  	 */
> >  	mutex_lock(&vmbus_connection.channel_mutex);
> >
> > +	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> > +		if (guid_equal(&channel->offermsg.offer.if_type,
> > +			       &newchannel->offermsg.offer.if_type) &&
> > +		    guid_equal(&channel->offermsg.offer.if_instance,
> > +			       &newchannel->offermsg.offer.if_instance)) {
> > +			fnew = false;
> > +			newchannel->primary_channel = channel;
> > +			break;
> > +		}
> > +	}
> > +
> >  	init_vp_index(newchannel);
> >
> >  	/* Remember the channels that should be cleaned up upon suspend. */
> > @@ -617,16 +628,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  	 */
> >  	atomic_dec(&vmbus_connection.offer_in_progress);
> >
> > -	list_for_each_entry(channel, &vmbus_connection.chn_list, listentry) {
> > -		if (guid_equal(&channel->offermsg.offer.if_type,
> > -			       &newchannel->offermsg.offer.if_type) &&
> > -		    guid_equal(&channel->offermsg.offer.if_instance,
> > -			       &newchannel->offermsg.offer.if_instance)) {
> > -			fnew = false;
> > -			break;
> > -		}
> > -	}
> > -
> >  	if (fnew) {
> >  		list_add_tail(&newchannel->listentry,
> >  			      &vmbus_connection.chn_list);
> > @@ -647,7 +648,6 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  		/*
> >  		 * Process the sub-channel.
> >  		 */
> > -		newchannel->primary_channel = channel;
> >  		list_add_tail(&newchannel->sc_list, &channel->sc_list);
> >  	}
> >
> > @@ -683,6 +683,30 @@ static void vmbus_process_offer(struct vmbus_channel *newchannel)
> >  	queue_work(wq, &newchannel->add_channel_work);
> >  }
> >
> > +/*
> > + * Check if CPUs used by other channels of the same device.
> > + * It's should only be called by init_vp_index().
>
> s/It's/It/
>
> > + */
> > +static bool hv_cpuself_used(u32 cpu, struct vmbus_channel *chn)
> > +{
> > +	struct vmbus_channel *primary = chn->primary_channel;
> > +	struct vmbus_channel *sc;
> > +
> > +	lockdep_assert_held(&vmbus_connection.channel_mutex);
> > +
> > +	if (!primary)
> > +		return false;
> > +
> > +	if (primary->target_cpu == cpu)
> > +		return true;
> > +
> > +	list_for_each_entry(sc, &primary->sc_list, sc_list)
> > +		if (sc != chn && sc->target_cpu == cpu)
> > +			return true;
> > +
> > +	return false;
> > +}
> > +
> >  /*
> >   * We use this state to statically distribute the channel interrupt load.
> >   */
> > @@ -702,6 +726,7 @@ static int next_numa_node_id;
> >  static void init_vp_index(struct vmbus_channel *channel)
> >  {
> >  	bool perf_chn = hv_is_perf_channel(channel);
> > +	u32 i, ncpu = num_online_cpus();
> >  	cpumask_var_t available_mask;
> >  	struct cpumask *alloced_mask;
> >  	u32 target_cpu;
> > @@ -724,31 +749,38 @@ static void init_vp_index(struct vmbus_channel *channel)
> >  		return;
> >  	}
> >
> > -	while (true) {
> > -		numa_node = next_numa_node_id++;
> > -		if (numa_node == nr_node_ids) {
> > -			next_numa_node_id = 0;
> > -			continue;
> > +	for (i = 1; i <= ncpu + 1; i++) {
> > +		while (true) {
> > +			numa_node = next_numa_node_id++;
> > +			if (numa_node == nr_node_ids) {
> > +				next_numa_node_id = 0;
> > +				continue;
> > +			}
> > +			if (cpumask_empty(cpumask_of_node(numa_node)))
> > +				continue;
> > +			break;
> > +		}
> > +		alloced_mask = &hv_context.hv_numa_map[numa_node];
> > +
> > +		if (cpumask_weight(alloced_mask) ==
> > +		    cpumask_weight(cpumask_of_node(numa_node))) {
> > +			/*
> > +			 * We have cycled through all the CPUs in the node;
> > +			 * reset the alloced map.
> > +			 */
> > +			cpumask_clear(alloced_mask);
> >  		}
> > -		if (cpumask_empty(cpumask_of_node(numa_node)))
> > -			continue;
> > -		break;
> > -	}
> > -	alloced_mask = &hv_context.hv_numa_map[numa_node];
> >
> > -	if (cpumask_weight(alloced_mask) ==
> > -	    cpumask_weight(cpumask_of_node(numa_node))) {
> > -		/*
> > -		 * We have cycled through all the CPUs in the node;
> > -		 * reset the alloced map.
> > -		 */
> > -		cpumask_clear(alloced_mask);
> > -	}
> > +		cpumask_xor(available_mask, alloced_mask,
> > +			    cpumask_of_node(numa_node));
> >
> > -	cpumask_xor(available_mask, alloced_mask, cpumask_of_node(numa_node));
> > +		target_cpu = cpumask_first(available_mask);
> > +		cpumask_set_cpu(target_cpu, alloced_mask);
> >
> > -	target_cpu = cpumask_first(available_mask);
> > -	cpumask_set_cpu(target_cpu, alloced_mask);
> > +		if (channel->offermsg.offer.sub_channel_index >= ncpu ||
> > +		    i > ncpu || !hv_cpuself_used(target_cpu, channel))
> > +			break;
> > +	}
> >
> >  	channel->target_cpu = target_cpu;
> >
> > --
> > 2.25.1
>
> I like this approach much better. I did some testing with a variety of CPU
> counts (1, 2, 3, 4, 8, and 32), NUMA nodes (1, 3, and 4), and with
> 4 SCSI controllers and 4 NICs. Everything worked as expected, and it's
> definitely an improvement.
>
> I flagged a couple of typos/wording errors, but maybe they can be fixed
> when the patch is pulled into hyperv-fixes.
>
> Reviewed-by: Michael Kelley <mikelley@xxxxxxxxxxxxx>
> Tested-by: Michael Kelley <mikelley@xxxxxxxxxxxxx>

Thank you for the review and test! I will just correct the comments and
submit a v3.

- Haiyang