On Wed, Jan 31, 2024 at 01:35:44PM +0100, Tobias Waldekranz wrote:
> Generating the list of MDB events to replay races against the IGMP/MLD
> snooping logic, which may concurrently enqueue events to the switchdev
> deferred queue, leading to duplicate events being sent to drivers.
>
> Avoid this by grabbing the write-side lock of the MDB, and make sure
> that a deferred version of a replay event is not already enqueued to
> the switchdev deferred queue before adding it to the replay list.
>
> An easy way to reproduce this issue, on an mv88e6xxx system, was to
> create a snooping bridge, and immediately add a port to it:
>
> root@infix-06-0b-00:~$ ip link add dev br0 up type bridge mcast_snooping 1 && \
> > ip link set dev x3 up master br0
> root@infix-06-0b-00:~$ ip link del dev br0
> root@infix-06-0b-00:~$ mvls atu
> ADDRESS            FID  STATE   Q  F  0  1  2  3  4  5  6  7  8  9  a
> DEV:0 Marvell 88E6393X
> 33:33:00:00:00:6a    1  static  -  -  0  .  .  .  .  .  .  .  .  .  .
> 33:33:ff:87:e4:3f    1  static  -  -  0  .  .  .  .  .  .  .  .  .  .
> ff:ff:ff:ff:ff:ff    1  static  -  -  0  1  2  3  4  5  6  7  8  9  a
> root@infix-06-0b-00:~$
>
> The two IPv6 groups remain in the hardware database because the
> port (x3) is notified of the host's membership twice: once in the
> original event and once in a replay. Since DSA tracks host addresses
> using reference counters, and only a single delete notification is
> sent, the count remains at 1 when the bridge is destroyed.

It's not really my business how the network maintainers handle this, but
if you intend this to go to 'net', you should provide a Fixes: tag. And
to make a compelling case for a submission to 'net', you should start
off by explaining what the user-visible impact of the bug is.

> Signed-off-by: Tobias Waldekranz <tobias@xxxxxxxxxxxxxx>
> ---
>  net/bridge/br_switchdev.c | 44 ++++++++++++++++++++++++---------------
>  1 file changed, 27 insertions(+), 17 deletions(-)
>
> diff --git a/net/bridge/br_switchdev.c b/net/bridge/br_switchdev.c
> index ee84e783e1df..a3481190d5e6 100644
> --- a/net/bridge/br_switchdev.c
> +++ b/net/bridge/br_switchdev.c
> @@ -595,6 +595,8 @@ br_switchdev_mdb_replay_one(struct notifier_block *nb, struct net_device *dev,
>  }
>
>  static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
> +                                      struct net_device *dev,
> +                                      unsigned long action,
>                                        enum switchdev_obj_id id,
>                                        const struct net_bridge_mdb_entry *mp,
>                                        struct net_device *orig_dev)
> @@ -608,8 +610,17 @@ static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
>          mdb->obj.id = id;
>          mdb->obj.orig_dev = orig_dev;
>          br_switchdev_mdb_populate(mdb, mp);
> -        list_add_tail(&mdb->obj.list, mdb_list);
>
> +        if (switchdev_port_obj_is_deferred(dev, action, &mdb->obj)) {
> +                /* This event is already in the deferred queue of
> +                 * events, so this replay must be elided, lest the
> +                 * driver receives duplicate events for it.
> +                 */
> +                kfree(mdb);

Would it make sense to make "mdb" a local on-stack variable, and
kmemdup() it only if it needs to be queued? There is a rough sketch of
what I mean at the bottom of this mail.

> +                return 0;
> +        }
> +
> +        list_add_tail(&mdb->obj.list, mdb_list);
>          return 0;
>  }
>
> @@ -677,22 +688,26 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
>          if (!br_opt_get(br, BROPT_MULTICAST_ENABLED))
>                  return 0;
>
> -        /* We cannot walk over br->mdb_list protected just by the rtnl_mutex,
> -         * because the write-side protection is br->multicast_lock. But we
> -         * need to emulate the [ blocking ] calling context of a regular
> -         * switchdev event, so since both br->multicast_lock and RCU read side
> -         * critical sections are atomic, we have no choice but to pick the RCU
> -         * read side lock, queue up all our events, leave the critical section
> -         * and notify switchdev from blocking context.
> +        if (adding)
> +                action = SWITCHDEV_PORT_OBJ_ADD;
> +        else
> +                action = SWITCHDEV_PORT_OBJ_DEL;
> +
> +        /* br_switchdev_mdb_queue_one will take care to not queue a

() after function names.

> +         * replay of an event that is already pending in the switchdev
> +         * deferred queue. In order to safely determine that, there
> +         * must be no new deferred MDB notifications enqueued for the
> +         * duration of the MDB scan. Therefore, grab the write-side
> +         * lock to avoid racing with any concurrent IGMP/MLD snooping.
>           */
> -        rcu_read_lock();
> +        spin_lock_bh(&br->multicast_lock);
>
>          hlist_for_each_entry_rcu(mp, &br->mdb_list, mdb_node) {

hlist_for_each_entry()

>                  struct net_bridge_port_group __rcu * const *pp;
>                  const struct net_bridge_port_group *p;
>
>                  if (mp->host_joined) {
> -                        err = br_switchdev_mdb_queue_one(&mdb_list,
> +                        err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
>                                                           SWITCHDEV_OBJ_ID_HOST_MDB,
>                                                           mp, br_dev);
>                          if (err) {
> @@ -706,7 +721,7 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
>                          if (p->key.port->dev != dev)
>                                  continue;
>
> -                        err = br_switchdev_mdb_queue_one(&mdb_list,
> +                        err = br_switchdev_mdb_queue_one(&mdb_list, dev, action,
>                                                           SWITCHDEV_OBJ_ID_PORT_MDB,
>                                                           mp, dev);
>                          if (err) {
> @@ -716,12 +731,7 @@ br_switchdev_mdb_replay(struct net_device *br_dev, struct net_device *dev,
>                          }
>                  }
>          }
>
> -        rcu_read_unlock();
> -
> -        if (adding)
> -                action = SWITCHDEV_PORT_OBJ_ADD;
> -        else
> -                action = SWITCHDEV_PORT_OBJ_DEL;
> +        spin_unlock_bh(&br->multicast_lock);
>
>          list_for_each_entry(obj, &mdb_list, list) {
>                  err = br_switchdev_mdb_replay_one(nb, dev,
> --
> 2.34.1
>
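To expand on the "mdb" comment above, this is roughly what I had in
mind (completely untested sketch, just to illustrate the idea; it keeps
the dev/action arguments and the switchdev_port_obj_is_deferred() helper
from your patch):

static int br_switchdev_mdb_queue_one(struct list_head *mdb_list,
                                      struct net_device *dev,
                                      unsigned long action,
                                      enum switchdev_obj_id id,
                                      const struct net_bridge_mdb_entry *mp,
                                      struct net_device *orig_dev)
{
        struct switchdev_obj_port_mdb mdb = {
                .obj = {
                        .id = id,
                        .orig_dev = orig_dev,
                },
        };
        struct switchdev_obj_port_mdb *pmdb;

        br_switchdev_mdb_populate(&mdb, mp);

        /* Already pending in the switchdev deferred queue; don't queue
         * a duplicate replay for it.
         */
        if (switchdev_port_obj_is_deferred(dev, action, &mdb.obj))
                return 0;

        pmdb = kmemdup(&mdb, sizeof(mdb), GFP_ATOMIC);
        if (!pmdb)
                return -ENOMEM;

        list_add_tail(&pmdb->obj.list, mdb_list);

        return 0;
}

That way the allocation is skipped entirely for the events that end up
being elided, and there is no kfree() on that path.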