RE: [PATCH for-next] IB/ipoib: Increase IPOIB Datagram mode MTU's upper limit

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 




> -----Original Message-----
> From: linux-rdma-owner@xxxxxxxxxxxxxxx [mailto:linux-rdma-
> owner@xxxxxxxxxxxxxxx] On Behalf Of Dennis Dalessandro
> Sent: Wednesday, May 23, 2018 10:37 AM
> To: jgg@xxxxxxxx; dledford@xxxxxxxxxx
> Cc: linux-rdma@xxxxxxxxxxxxxxx; Mike Marciniszyn
> <mike.marciniszyn@xxxxxxxxx>; Ira Weiny <ira.weiny@xxxxxxxxx>; Sadanand
> Warrier <sadanand.warrier@xxxxxxxxx>
> Subject: [PATCH for-next] IB/ipoib: Increase IPOIB Datagram mode MTU's upper
> limit
> 
> From: Sadanand Warrier <sadanand.warrier@xxxxxxxxx>
> 
> Currently the IPOIB ud mtu is restricted to 4Kbytes. Remove this limitation so
> that the IPOIB module can potentially use an MTU that is bounded by the MTU
> of the underlying device.
> 
> Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx>
> Reviewed-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> Reviewed-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx>
> Signed-off-by: Sadanand Warrier <sadanand.warrier@xxxxxxxxx>
> Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx>
> ---
>  drivers/infiniband/hw/hfi1/hfi.h               |    6 +++
>  drivers/infiniband/hw/hfi1/qp.c                |   21 ++---------
>  drivers/infiniband/hw/hfi1/verbs.c             |    9 +++--
>  drivers/infiniband/hw/qib/qib_qp.c             |    2 +
>  drivers/infiniband/hw/qib/qib_verbs.c          |    2 +
>  drivers/infiniband/sw/rdmavt/qp.c              |    2 +
>  drivers/infiniband/ulp/ipoib/ipoib_main.c      |    2 +
>  drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    8 +++-
>  include/rdma/ib_verbs.h                        |   48 ++++++++++++++++++++++++
>  include/rdma/opa_port_info.h                   |    8 ----
>  include/rdma/rdma_vt.h                         |    2 +
>  11 files changed, 73 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h
> index dd84238..de1b02a 100644
> --- a/drivers/infiniband/hw/hfi1/hfi.h
> +++ b/drivers/infiniband/hw/hfi1/hfi.h
> @@ -1721,6 +1721,12 @@ static inline int valid_ib_mtu(unsigned int mtu)
>  		mtu == 4096;
>  }
> 
> +static inline int valid_opa_mtu(unsigned int mtu) {
> +	return  valid_ib_mtu(mtu) || mtu == 8192 ||
> +		mtu == 10240;
> +}
> +
>  static inline int valid_opa_max_mtu(unsigned int mtu)  {
>  	return mtu >= 2048 &&
> diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c
> index 1697d96..977bacd 100644
> --- a/drivers/infiniband/hw/hfi1/qp.c
> +++ b/drivers/infiniband/hw/hfi1/qp.c
> @@ -168,15 +168,6 @@ static void flush_iowait(struct rvt_qp *qp)
>  	write_sequnlock_irqrestore(lock, flags);  }
> 
> -static inline int opa_mtu_enum_to_int(int mtu) -{
> -	switch (mtu) {
> -	case OPA_MTU_8192:  return 8192;
> -	case OPA_MTU_10240: return 10240;
> -	default:            return -1;
> -	}
> -}
> -
>  /**
>   * This function is what we would push to the core layer if we wanted to be a
>   * "first class citizen".  Instead we hide this here and rely on Verbs ULPs @@ -
> 184,15 +175,9 @@ static inline int opa_mtu_enum_to_int(int mtu)
>   */
>  static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu
> mtu)  {
> -	int val;
> -
> -	/* Constraining 10KB packets to 8KB packets */
>  	if (mtu == (enum ib_mtu)OPA_MTU_10240)
>  		mtu = OPA_MTU_8192;
> -	val = opa_mtu_enum_to_int((int)mtu);
> -	if (val > 0)
> -		return val;
> -	return ib_mtu_enum_to_int(mtu);
> +	return opa_mtu_enum_to_int((enum opa_mtu)mtu);
>  }
> 
>  int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, @@ -
> 308,13 +293,13 @@ int hfi1_check_send_wqe(struct rvt_qp *qp,
>  		break;
>  	case IB_QPT_SMI:
>  		ah = ibah_to_rvtah(wqe->ud_wr.ah);
> -		if (wqe->length > (1 << ah->log_pmtu))
> +		if (wqe->length > ah->pmtu)
>  			return -EINVAL;
>  		break;
>  	case IB_QPT_GSI:
>  	case IB_QPT_UD:
>  		ah = ibah_to_rvtah(wqe->ud_wr.ah);
> -		if (wqe->length > (1 << ah->log_pmtu))
> +		if (wqe->length > ah->pmtu)
>  			return -EINVAL;
>  		if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf)
>  			return -EINVAL;
> diff --git a/drivers/infiniband/hw/hfi1/verbs.c
> b/drivers/infiniband/hw/hfi1/verbs.c
> index fc2e44c..ab67f8d 100644
> --- a/drivers/infiniband/hw/hfi1/verbs.c
> +++ b/drivers/infiniband/hw/hfi1/verbs.c
> @@ -1481,9 +1481,10 @@ static int query_port(struct rvt_dev_info *rdi, u8
> port_num,
>  	 * from the Path Records to us will get the new 8k MTU.  Those that
>  	 * attempt to process the MTU enum may fail in various ways.
>  	 */
> -	props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ?
> -				      4096 : hfi1_max_mtu), IB_MTU_4096);
> -	props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu :
> +	props->max_mtu = mtu_to_enum((!valid_opa_mtu(hfi1_max_mtu) ?
> +				      4096 : hfi1_max_mtu),
> +				     IB_MTU_4096);
> +	props->active_mtu = !valid_opa_mtu(ppd->ibmtu) ? props->max_mtu :
>  		mtu_to_enum(ppd->ibmtu, IB_MTU_4096);
> 
>  	/*
> @@ -1617,7 +1618,7 @@ static void hfi1_notify_new_ah(struct ib_device
> *ibdev,
>  	dd = dd_from_ppd(ppd);
>  	ah->vl = sc_to_vlt(dd, sc5);
>  	if (ah->vl < num_vls || ah->vl == 15)
> -		ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu);
> +		ah->pmtu = dd->vld[ah->vl].mtu;
>  }
> 
>  /**
> diff --git a/drivers/infiniband/hw/qib/qib_qp.c
> b/drivers/infiniband/hw/qib/qib_qp.c
> index 344e401..59f0093 100644
> --- a/drivers/infiniband/hw/qib/qib_qp.c
> +++ b/drivers/infiniband/hw/qib/qib_qp.c
> @@ -402,7 +402,7 @@ int qib_check_send_wqe(struct rvt_qp *qp,
>  	case IB_QPT_GSI:
>  	case IB_QPT_UD:
>  		ah = ibah_to_rvtah(wqe->ud_wr.ah);
> -		if (wqe->length > (1 << ah->log_pmtu))
> +		if (wqe->length > ah->pmtu)
>  			return -EINVAL;
>  		/* progress hint */
>  		ret = 1;
> diff --git a/drivers/infiniband/hw/qib/qib_verbs.c
> b/drivers/infiniband/hw/qib/qib_verbs.c
> index 14b4057..c5d0ae4 100644
> --- a/drivers/infiniband/hw/qib/qib_verbs.c
> +++ b/drivers/infiniband/hw/qib/qib_verbs.c
> @@ -1367,7 +1367,7 @@ static void qib_notify_new_ah(struct ib_device
> *ibdev,
>  	ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr));
>  	ppd = ppd_from_ibp(ibp);
>  	ah->vl = ibp->sl_to_vl[rdma_ah_get_sl(&ah->attr)];
> -	ah->log_pmtu = ilog2(ppd->ibmtu);
> +	ah->pmtu = ppd->ibmtu;
>  }
> 
>  struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) diff --git
> a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c
> index 6e9a351..00ddf97 100644
> --- a/drivers/infiniband/sw/rdmavt/qp.c
> +++ b/drivers/infiniband/sw/rdmavt/qp.c
> @@ -1835,7 +1835,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
>  	    qp->ibqp.qp_type != IB_QPT_RC) {
>  		struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
> 
> -		log_pmtu = ah->log_pmtu;
> +		log_pmtu =  ilog2(ah->pmtu);
I don’t know if UD is widely used in IPoIB or not.
But won't you have any performance impact by doing ilog2 on every work request?
Just curious.

>  		atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
>  	}
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index cf291f9..49af25f 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -2257,7 +2257,7 @@ void ipoib_set_dev_features(struct ipoib_dev_priv
> *priv, struct ib_device *hca)
>  		goto device_init_failed;
>  	}
> 
> -	priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
> +	priv->max_ib_mtu = rdma_mtu_enum_to_int(hca, port, attr.max_mtu);
> 
>  	/* MTU will be reset when mcast join happens */
>  	priv->dev->mtu  = IPOIB_UD_MTU(priv->max_ib_mtu); diff --git
> a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> index 6709328..025cb07 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
> @@ -243,10 +243,14 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast
> *mcast,
>  		if (priv->mcast_mtu == priv->admin_mtu)
>  			priv->admin_mtu =
>  			priv->mcast_mtu =
> -			IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast-
> >mcmember.mtu));
> +			IPOIB_UD_MTU(rdma_mtu_enum_to_int(priv->ca,
> +							  priv->port,
> +							  priv->broadcast-
> >mcmember.mtu));
>  		else
>  			priv->mcast_mtu =
> -			IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast-
> >mcmember.mtu));
> +			IPOIB_UD_MTU(rdma_mtu_enum_to_int(priv->ca,
> +							  priv->port,
> +							  priv->broadcast-
> >mcmember.mtu));
> 
>  		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
>  		spin_unlock_irq(&priv->lock);
> diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index
> e849bd0..b1fd9b1 100644
> --- a/include/rdma/ib_verbs.h
> +++ b/include/rdma/ib_verbs.h
> @@ -395,6 +395,11 @@ enum ib_mtu {
>  	IB_MTU_4096 = 5
>  };
> 
> +enum opa_mtu {
> +	OPA_MTU_8192 = 6,
> +	OPA_MTU_10240 = 7
> +};
> +
>  static inline int ib_mtu_enum_to_int(enum ib_mtu mtu)  {
>  	switch (mtu) {
> @@ -421,6 +426,24 @@ static inline enum ib_mtu ib_mtu_int_to_enum(int
> mtu)
>  		return IB_MTU_256;
>  }
> 
> +static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) {
> +	switch (mtu) {
> +	case OPA_MTU_8192: return 8192;
> +	case OPA_MTU_10240: return 10240;
> +	default: return(ib_mtu_enum_to_int((enum ib_mtu) mtu));
> +	}
> +}
> +
> +static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) {
> +	if (mtu >= 10240)
> +		return OPA_MTU_10240;
> +	else if (mtu >= 8192)
> +		return OPA_MTU_8192;
> +	else return ((enum opa_mtu) ib_mtu_int_to_enum(mtu)); }
> +
>  enum ib_port_state {
>  	IB_PORT_NOP		= 0,
>  	IB_PORT_DOWN		= 1,
> @@ -2997,6 +3020,31 @@ static inline bool rdma_cap_read_inv(struct
> ib_device *dev, u32 port_num)
>  	return rdma_protocol_iwarp(dev, port_num);  }
> 
> +/**
> + * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not.
> + *
> + * @device: Device
> + * @port_num: 1 based Port number
> + *
> + * Return true if the port is an Intel OPA port false if not.
> + */
> +
> +static inline bool rdma_core_cap_opa_port(struct ib_device *device,
> +					  u32 port_num)
> +{
> +	return (device->port_immutable[port_num].core_cap_flags &
> RDMA_CORE_PORT_INTEL_OPA)
> +		== RDMA_CORE_PORT_INTEL_OPA;
> +}
> +
> +static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port,
> +				       int mtu)
> +{
> +	if (rdma_core_cap_opa_port(device, port))
> +		return opa_mtu_enum_to_int((enum opa_mtu) mtu);
> +	else
> +		return ib_mtu_enum_to_int((enum ib_mtu) mtu); }
> +

Above rdma function needs kdoc comment block, mainly to describe what is mtu as its typecasted internally.
��.n��������+%������w��{.n�����{���fk��ܨ}���Ơz�j:+v�����w����ޙ��&�)ߡ�a����z�ޗ���ݢj��w�f




[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux