RE: Fwd: mlx5_ib_post_send panic on s390x

Parav Pandit <parav@xxxxxxxxxxxx> · Sun, 12 Mar 2017 20:38:04 +0000

I meant mlx4_ib* driver below. Sorry for typo.

> -----Original Message-----
> From: linux-rdma-owner@xxxxxxxxxxxxxxx [mailto:linux-rdma-
> owner@xxxxxxxxxxxxxxx] On Behalf Of Parav Pandit
> Sent: Sunday, March 12, 2017 3:21 PM
> To: Ursula Braun <ubraun@xxxxxxxxxxxxxxxxxx>; Eli Cohen
> <eli@xxxxxxxxxxxx>; Matan Barak <matanb@xxxxxxxxxxxx>
> Cc: Saeed Mahameed <saeedm@xxxxxxxxxxxx>; Leon Romanovsky
> <leonro@xxxxxxxxxxxx>; linux-rdma@xxxxxxxxxxxxxxx
> Subject: RE: Fwd: mlx5_ib_post_send panic on s390x
> 
> Hi Ursula,
> 
> > -----Original Message-----
> > From: linux-rdma-owner@xxxxxxxxxxxxxxx [mailto:linux-rdma-
> > owner@xxxxxxxxxxxxxxx] On Behalf Of Ursula Braun
> > Sent: Thursday, March 9, 2017 3:54 AM
> > To: Eli Cohen <eli@xxxxxxxxxxxx>; Matan Barak <matanb@xxxxxxxxxxxx>
> > Cc: Saeed Mahameed <saeedm@xxxxxxxxxxxx>; Leon Romanovsky
> > <leonro@xxxxxxxxxxxx>; linux-rdma@xxxxxxxxxxxxxxx
> > Subject: Re: Fwd: mlx5_ib_post_send panic on s390x
> >
> >
> >
> > On 03/06/2017 02:08 PM, Eli Cohen wrote:
> > >>>
> > >>> The problem seems to be caused by the usage of plain memcpy in
> > set_data_inl_seg().
> > >>> The address provided by SMC-code in struct ib_send_wr *wr is an
> > >>> address belonging to an area mapped with the ib_dma_map_single()
> > >>> call. On s390x those kind of addresses require extra access
> > >>> functions (see
> > arch/s390/include/asm/io.h).
> > >>>
> > >
> > > By definition, when you are posting a send request with inline, the
> > > address
> > must be mapped to the cpu so plain memcpy should work.
> > >
> > In the past I run SMC-R with Connect X3 cards. The mlx4 driver does
> > not seem to contain extra coding for IB_SEND_INLINE flag for
> > ib_post_send. Does this mean for SMC-R to run on Connect X3 cards the
> > IB_SEND_INLINE flag is ignored, and thus I needed the
> > ib_dma_map_single() call for the area used with ib_post_send()? Does
> > this mean I should stay away from the IB_SEND_INLINE flag, if I want
> > to run the same SMC-R code with both, Connect X3 cards and Connect X4
> cards?
> >
> I had encountered the same kernel panic that you mentioned last week on
> ConnectX-4 adapters with smc-r on x86_64.
> Shall I submit below fix to netdev mailing list?
> I have tested above change. I also have optimization that avoids dma mapping
> for wr_tx_dma_addr.
> 
> -               lnk->wr_tx_sges[i].addr =
> -                       lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE;
> +               lnk->wr_tx_sges[i].addr = (uintptr_t)(lnk->wr_tx_bufs +
> + i);
> 
> I also have fix for processing IB_SEND_INLINE in mlx4 driver on little older
> kernel base.
> I have attached below. I can rebase my kernel and provide fix in mlx5_ib driver.
> Let me know.
> 
> Regards,
> Parav Pandit
> 
> diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
> index a2e4ca5..0d984f5 100644
> --- a/drivers/infiniband/hw/mlx4/qp.c
> +++ b/drivers/infiniband/hw/mlx4/qp.c
> @@ -2748,6 +2748,7 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct
> ib_send_wr *wr,
>  	unsigned long flags;
>  	int nreq;
>  	int err = 0;
> +	int inl = 0;
>  	unsigned ind;
>  	int uninitialized_var(stamp);
>  	int uninitialized_var(size);
> @@ -2958,30 +2959,97 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct
> ib_send_wr *wr,
>  		default:
>  			break;
>  		}
> +		if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
> +			struct mlx4_wqe_inline_seg *seg;
> +			void *addr;
> +			int len, seg_len;
> +			int num_seg;
> +			int off, to_copy;
> +
> +			inl = 0;
> +
> +			seg = wqe;
> +			wqe += sizeof *seg;
> +			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
> +			num_seg = 0;
> +			seg_len = 0;
> +
> +			for (i = 0; i < wr->num_sge; ++i) {
> +				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
> +				len  = wr->sg_list[i].length;
> +				inl += len;
> +
> +				if (inl > 16) {
> +					inl = 0;
> +					err = ENOMEM;
> +					*bad_wr = wr;
> +					goto out;
> +				}
> 
> -		/*
> -		 * Write data segments in reverse order, so as to
> -		 * overwrite cacheline stamp last within each
> -		 * cacheline.  This avoids issues with WQE
> -		 * prefetching.
> -		 */
> +				while (len >= MLX4_INLINE_ALIGN - off) {
> +					to_copy = MLX4_INLINE_ALIGN - off;
> +					memcpy(wqe, addr, to_copy);
> +					len -= to_copy;
> +					wqe += to_copy;
> +					addr += to_copy;
> +					seg_len += to_copy;
> +					wmb(); /* see comment below */
> +					seg->byte_count =
> htonl(MLX4_INLINE_SEG | seg_len);
> +					seg_len = 0;
> +					seg = wqe;
> +					wqe += sizeof *seg;
> +					off = sizeof *seg;
> +					++num_seg;
> +				}
> 
> -		dseg = wqe;
> -		dseg += wr->num_sge - 1;
> -		size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) /
> 16);
> +				memcpy(wqe, addr, len);
> +				wqe += len;
> +				seg_len += len;
> +				off += len;
> +			}
> 
> -		/* Add one more inline data segment for ICRC for MLX sends */
> -		if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
> -			     qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
> -			     qp->mlx4_ib_qp_type &
> -			     (MLX4_IB_QPT_PROXY_SMI_OWNER |
> MLX4_IB_QPT_TUN_SMI_OWNER))) {
> -			set_mlx_icrc_seg(dseg + 1);
> -			size += sizeof (struct mlx4_wqe_data_seg) / 16;
> -		}
> +			if (seg_len) {
> +				++num_seg;
> +				/*
> +				 * Need a barrier here to make sure
> +				 * all the data is visible before the
> +				 * byte_count field is set.  Otherwise
> +				 * the HCA prefetcher could grab the
> +				 * 64-byte chunk with this inline
> +				 * segment and get a valid (!=
> +				 * 0xffffffff) byte count but stale
> +				 * data, and end up sending the wrong
> +				 * data.
> +				 */
> +				wmb();
> +				seg->byte_count = htonl(MLX4_INLINE_SEG |
> seg_len);
> +			}
> 
> -		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
> -			set_data_seg(dseg, wr->sg_list + i);
> +			size += (inl + num_seg * sizeof (*seg) + 15) / 16;
> +		} else {
> +			/*
> +			 * Write data segments in reverse order, so as to
> +			 * overwrite cacheline stamp last within each
> +			 * cacheline.  This avoids issues with WQE
> +			 * prefetching.
> +			 */
> +
> +			dseg = wqe;
> +			dseg += wr->num_sge - 1;
> +			size += wr->num_sge * (sizeof (struct
> mlx4_wqe_data_seg) / 16);
> +
> +			/* Add one more inline data segment for ICRC for MLX
> sends */
> +			if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI
> ||
> +				     qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI
> ||
> +				     qp->mlx4_ib_qp_type &
> +				     (MLX4_IB_QPT_PROXY_SMI_OWNER |
> MLX4_IB_QPT_TUN_SMI_OWNER))) {
> +				set_mlx_icrc_seg(dseg + 1);
> +				size += sizeof (struct mlx4_wqe_data_seg) / 16;
> +			}
> 
> +			for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
> +				set_data_seg(dseg, wr->sg_list + i);
> +		}
>  		/*
>  		 * Possibly overwrite stamping in cacheline with LSO
>  		 * segment only after making sure all data segments
> 
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> > in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo
> > info at http://vger.kernel.org/majordomo-info.html
>  {.n +       +%  lzwm  b 맲  r  zX  ݙ     ܨ}   Ơz &j:+v        zZ+  +zf   h   ~    i   z  w   ?
> & )ߢf
��.n��������+%������w��{.n�����{���fk��ܨ}���Ơz�j:+v�����w����ޙ��&�)ߡ�a����z�ޗ���ݢj��w�f