Re: [PATCH for-rc 5/5] IB/mlx4: Add support for REJ due to timeout

> On 20 Jul 2020, at 14:22, Håkon Bugge <haakon.bugge@xxxxxxxxxx> wrote:
> 
> A CM REJ packet with its reason equal to timeout is a special beast in
> the sense that it doesn't have a Remote Communication ID nor does it
> have a Remote Port GID.
> 
> Using CX-3 virtual functions, either from a bare-metal machine or
> pass-through from a VM, MAD packets are proxied through the PF driver.
> 
> Since the VF drivers have separate namespaces for MAD Transaction IDs
> (TIDs), the PF driver has to re-map the TIDs and keep the bookkeeping
> in a cache.
> 
> This proxying does not handle said REJ packets.
> 
> If the active side abandons its connection attempt after having sent a
> REQ, it will send a REJ with the reason set to timeout. This scenario
> can be provoked by a simple user-verbs program, which ends up doing:
> 
>    rdma_connect(cm_id, &conn_param);
>    rdma_destroy_id(cm_id);
> 
> using the async librdmacm API.
> 
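
For reference, a minimal sketch of such a reproducer (untested; error
handling and QP setup are elided, the destination host/port in argv and
the 2000 ms timeouts are arbitrary, and the essential part is simply
destroying the cm_id right after rdma_connect()):

	#include <rdma/rdma_cma.h>

	int main(int argc, char *argv[])
	{
		struct rdma_event_channel *ch;
		struct rdma_cm_id *cm_id;
		struct rdma_conn_param conn_param = {};
		struct rdma_addrinfo hints = {}, *rai;
		struct rdma_cm_event *event;

		ch = rdma_create_event_channel();
		rdma_create_id(ch, &cm_id, NULL, RDMA_PS_TCP);

		/* argv[1]/argv[2]: destination host and port */
		hints.ai_port_space = RDMA_PS_TCP;
		rdma_getaddrinfo(argv[1], argv[2], &hints, &rai);

		rdma_resolve_addr(cm_id, NULL, rai->ai_dst_addr, 2000);
		rdma_get_cm_event(ch, &event);	/* ADDR_RESOLVED */
		rdma_ack_cm_event(event);

		rdma_resolve_route(cm_id, 2000);
		rdma_get_cm_event(ch, &event);	/* ROUTE_RESOLVED */
		rdma_ack_cm_event(event);

		rdma_connect(cm_id, &conn_param);	/* REQ goes out */
		rdma_destroy_id(cm_id);	/* abandon; CM sends REJ (timeout) */

		rdma_freeaddrinfo(rai);
		rdma_destroy_event_channel(ch);
		return 0;
	}
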
> With dynamic debug prints enabled in the mlx4_ib driver, we then see:
> 
> mlx4_ib_demux_cm_handler: Couldn't find an entry for pv_cm_id 0x0, attr_id 0x12
> 
> The solution is to introduce a radix-tree. When a REQ packet is
> received and handled in mlx4_ib_demux_cm_handler(), we know the
> connecting peer's para-virtual cm_id and the destination slave. We
> then insert an entry into the tree with said information. We also
> schedule delayed work to remove this entry from the tree and free it,
> in order to avoid a memory leak.
> 
> When a REJ packet with reason timeout is received, we can look up the
> destination slave in the tree and deliver the packet to it.
> 
> When cleaning up, we simply traverse the tree and modify any delayed
> work to use a zero delay. A subsequent flush of the system_wq then
> ensures that all entries are wiped out.
> 
> Signed-off-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx>
> ---
> drivers/infiniband/hw/mlx4/cm.c      | 133 ++++++++++++++++++++++++++-
> drivers/infiniband/hw/mlx4/mlx4_ib.h |   3 +
> 2 files changed, 135 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
> index 6f0ffd0906e6..883436548901 100644
> --- a/drivers/infiniband/hw/mlx4/cm.c
> +++ b/drivers/infiniband/hw/mlx4/cm.c
> @@ -54,11 +54,22 @@ struct id_map_entry {
> 	struct delayed_work timeout;
> };
> 
> +struct rej_tmout_entry {
> +	int slave;
> +	u32 rem_pv_cm_id;
> +	struct delayed_work timeout;
> +	struct radix_tree_root *rej_tmout_root;
> +	/* Points to the mutex protecting this radix-tree */
> +	struct mutex *lock;
> +};
> +
> struct cm_generic_msg {
> 	struct ib_mad_hdr hdr;
> 
> 	__be32 local_comm_id;
> 	__be32 remote_comm_id;
> +	unsigned char unused[2];
> +	__be16 rej_reason;
> };
> 
> struct cm_sidr_generic_msg {
> @@ -285,6 +296,7 @@ static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
> 	spin_unlock(&sriov->id_map_lock);
> }
> 
> +#define REJ_REASON(m) be16_to_cpu(((struct cm_generic_msg *)(m))->rej_reason)
> int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
> 		struct ib_mad *mad)
> {
> @@ -295,7 +307,8 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
> 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
> 	    mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
> 	    mad->mad_hdr.attr_id == CM_MRA_ATTR_ID ||
> -	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
> +	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID ||
> +	    (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID && REJ_REASON(mad) == IB_CM_REJ_TIMEOUT)) {
> 		sl_cm_id = get_local_comm_id(mad);
> 		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
> 		if (id)
> @@ -328,11 +341,87 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
> 	return 0;
> }
> 
> +static void rej_tmout_timeout(struct work_struct *work)
> +{
> +	struct delayed_work *delay = to_delayed_work(work);
> +	struct rej_tmout_entry *item = container_of(delay, struct rej_tmout_entry, timeout);
> +	struct rej_tmout_entry *deleted;
> +
> +	mutex_lock(item->lock);
> +	deleted = radix_tree_delete_item(item->rej_tmout_root, item->rem_pv_cm_id, NULL);
> +	mutex_unlock(item->lock);
> +
> +	if (deleted != item)
> +		pr_debug("deleted(%p) != item(%p)\n", deleted, item);
> +
> +	pr_debug("rej_tmout entry, rem_pv_cm_id 0x%x, slave %d deleted\n",
> +		 item->rem_pv_cm_id, item->slave);
> +	kfree(item);
> +}
> +
> +static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave)
> +{
> +	struct rej_tmout_entry *item;
> +	int sts;
> +
> +	mutex_lock(&sriov->rej_tmout_lock);
> +	item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id);
> +	mutex_unlock(&sriov->rej_tmout_lock);
> +	if (item)
> +		return PTR_ERR(item);

Hmm, this should read:
		return IS_ERR(item) ? PTR_ERR(item) : 0;
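
In context, the start of alloc_rej_tmout() would then look like this in
v2 (a sketch; nothing stores an ERR_PTR in the tree today, so the
IS_ERR() check is purely defensive):

	mutex_lock(&sriov->rej_tmout_lock);
	item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id);
	mutex_unlock(&sriov->rej_tmout_lock);
	if (item) {
		/* A REQ with this rem_pv_cm_id is already tracked;
		 * nothing to allocate.
		 */
		return IS_ERR(item) ? PTR_ERR(item) : 0;
	}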

I'll also remove the noisy pr_debug()s in this commit.

Will wait for further comments before sending out a v2.


Thxs, Håkon


> +
> +	item = kmalloc(sizeof(*item), GFP_KERNEL);
> +	if (!item)
> +		return -ENOMEM;
> +
> +	INIT_DELAYED_WORK(&item->timeout, rej_tmout_timeout);
> +	item->slave = slave;
> +	item->rem_pv_cm_id = rem_pv_cm_id;
> +	item->rej_tmout_root = &sriov->rej_tmout_root;
> +	item->lock = &sriov->rej_tmout_lock;
> +
> +	mutex_lock(&sriov->rej_tmout_lock);
> +	sts = radix_tree_insert(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id, item);
> +	mutex_unlock(&sriov->rej_tmout_lock);
> +	if (sts)
> +		goto err_insert;
> +
> +	pr_debug("Inserted rem_pv_cm_id 0x%x slave %d\n", rem_pv_cm_id, slave);
> +	schedule_delayed_work(&item->timeout, CM_CLEANUP_CACHE_TIMEOUT);
> +
> +	return 0;
> +
> +err_insert:
> +	kfree(item);
> +	return sts;
> +}
> +
> +static int lookup_rej_tmout_slave(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id)
> +{
> +	struct rej_tmout_entry *item;
> +
> +	mutex_lock(&sriov->rej_tmout_lock);
> +	item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id);
> +	mutex_unlock(&sriov->rej_tmout_lock);
> +
> +	if (!item || IS_ERR(item)) {
> +		pr_debug("Could not find rem_pv_cm_id 0x%x error: %d\n",
> +			 rem_pv_cm_id, (int)PTR_ERR(item));
> +		return !item ? -ENOENT : PTR_ERR(item);
> +	}
> +	pr_debug("Found rem_pv_cm_id 0x%x slave: %d\n", rem_pv_cm_id, item->slave);
> +
> +	return item->slave;
> +}
> +
> int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
> 			     struct ib_mad *mad)
> {
> +	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
> +	u32 rem_pv_cm_id = get_local_comm_id(mad);
> 	u32 pv_cm_id;
> 	struct id_map_entry *id;
> +	int sts;
> 
> 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
> 	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
> @@ -348,7 +437,18 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
> 				     be64_to_cpu(gid.global.interface_id));
> 			return -ENOENT;
> 		}
> +
> +		sts = alloc_rej_tmout(sriov, rem_pv_cm_id, *slave);
> +		if (sts)
> +			/* Even if this fails, we pass on the REQ to the slave */
> +			pr_debug("Could not allocate rej_tmout entry. rem_pv_cm_id 0x%x slave %d status %d\n",
> +				 rem_pv_cm_id, *slave, sts);
> +
> 		return 0;
> +	} else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID && REJ_REASON(mad) == IB_CM_REJ_TIMEOUT) {
> +		*slave = lookup_rej_tmout_slave(sriov, rem_pv_cm_id);
> +
> +		return *slave < 0 ? *slave : 0;
> 	}
> 
> 	pv_cm_id = get_remote_comm_id(mad);
> @@ -377,6 +477,35 @@ void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
> 	INIT_LIST_HEAD(&dev->sriov.cm_list);
> 	dev->sriov.sl_id_map = RB_ROOT;
> 	xa_init_flags(&dev->sriov.pv_id_table, XA_FLAGS_ALLOC);
> +	mutex_init(&dev->sriov.rej_tmout_lock);
> +	INIT_RADIX_TREE(&dev->sriov.rej_tmout_root, GFP_KERNEL);
> +}
> +
> +static void rej_tmout_tree_cleanup(struct mlx4_ib_sriov *sriov, int slave)
> +{
> +	struct radix_tree_iter iter;
> +	bool flush_needed = false;
> +	void **slot;
> +	int cnt = 0;
> +
> +	mutex_lock(&sriov->rej_tmout_lock);
> +	radix_tree_for_each_slot(slot, &sriov->rej_tmout_root, &iter, 0) {
> +		struct rej_tmout_entry *item = *slot;
> +
> +		if (slave < 0 || slave == item->slave) {
> +			mod_delayed_work(system_wq, &item->timeout, 0);
> +			flush_needed = true;
> +			++cnt;
> +		}
> +	}
> +	mutex_unlock(&sriov->rej_tmout_lock);
> +
> +	if (flush_needed) {
> +		flush_scheduled_work();
> +
> +		pr_debug("%d entries in radix_tree for slave %d during cleanup\n",
> +			 cnt, slave);
> +	}
> }
> 
> /* slave = -1 ==> all slaves */
> @@ -446,4 +575,6 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
> 		list_del(&map->list);
> 		kfree(map);
> 	}
> +
> +	rej_tmout_tree_cleanup(sriov, slave);
> }
> diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
> index 20cfa69d0aed..92cb686bdc49 100644
> --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
> +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
> @@ -495,6 +495,9 @@ struct mlx4_ib_sriov {
> 	spinlock_t id_map_lock;
> 	struct rb_root sl_id_map;
> 	struct list_head cm_list;
> +	/* Protects the radix-tree */
> +	struct mutex rej_tmout_lock;
> +	struct radix_tree_root rej_tmout_root;
> };
> 
> struct gid_cache_context {
> -- 
> 2.20.1
> 