> -----Original Message----- > From: linux-rdma-owner@xxxxxxxxxxxxxxx [mailto:linux-rdma- > owner@xxxxxxxxxxxxxxx] On Behalf Of Dennis Dalessandro > Sent: Wednesday, May 23, 2018 10:37 AM > To: jgg@xxxxxxxx; dledford@xxxxxxxxxx > Cc: linux-rdma@xxxxxxxxxxxxxxx; Mike Marciniszyn > <mike.marciniszyn@xxxxxxxxx>; Ira Weiny <ira.weiny@xxxxxxxxx>; Sadanand > Warrier <sadanand.warrier@xxxxxxxxx> > Subject: [PATCH for-next] IB/ipoib: Increase IPOIB Datagram mode MTU's upper > limit > > From: Sadanand Warrier <sadanand.warrier@xxxxxxxxx> > > Currently the IPOIB ud mtu is restricted to 4Kbytes. Remove this limitation so > that the IPOIB module can potentially use an MTU that is bounded by the MTU > of the underlying device. > > Reviewed-by: Mike Marciniszyn <mike.marciniszyn@xxxxxxxxx> > Reviewed-by: Ira Weiny <ira.weiny@xxxxxxxxx> > Reviewed-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> > Signed-off-by: Sadanand Warrier <sadanand.warrier@xxxxxxxxx> > Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxx> > --- > drivers/infiniband/hw/hfi1/hfi.h | 6 +++ > drivers/infiniband/hw/hfi1/qp.c | 21 ++--------- > drivers/infiniband/hw/hfi1/verbs.c | 9 +++-- > drivers/infiniband/hw/qib/qib_qp.c | 2 + > drivers/infiniband/hw/qib/qib_verbs.c | 2 + > drivers/infiniband/sw/rdmavt/qp.c | 2 + > drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 + > drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 8 +++- > include/rdma/ib_verbs.h | 48 ++++++++++++++++++++++++ > include/rdma/opa_port_info.h | 8 ---- > include/rdma/rdma_vt.h | 2 + > 11 files changed, 73 insertions(+), 37 deletions(-) > > diff --git a/drivers/infiniband/hw/hfi1/hfi.h b/drivers/infiniband/hw/hfi1/hfi.h > index dd84238..de1b02a 100644 > --- a/drivers/infiniband/hw/hfi1/hfi.h > +++ b/drivers/infiniband/hw/hfi1/hfi.h > @@ -1721,6 +1721,12 @@ static inline int valid_ib_mtu(unsigned int mtu) > mtu == 4096; > } > > +static inline int valid_opa_mtu(unsigned int mtu) { > + return valid_ib_mtu(mtu) || mtu == 8192 || > + mtu == 10240; > +} > + > static inline int valid_opa_max_mtu(unsigned int mtu) { > return mtu >= 2048 && > diff --git a/drivers/infiniband/hw/hfi1/qp.c b/drivers/infiniband/hw/hfi1/qp.c > index 1697d96..977bacd 100644 > --- a/drivers/infiniband/hw/hfi1/qp.c > +++ b/drivers/infiniband/hw/hfi1/qp.c > @@ -168,15 +168,6 @@ static void flush_iowait(struct rvt_qp *qp) > write_sequnlock_irqrestore(lock, flags); } > > -static inline int opa_mtu_enum_to_int(int mtu) -{ > - switch (mtu) { > - case OPA_MTU_8192: return 8192; > - case OPA_MTU_10240: return 10240; > - default: return -1; > - } > -} > - > /** > * This function is what we would push to the core layer if we wanted to be a > * "first class citizen". Instead we hide this here and rely on Verbs ULPs @@ - > 184,15 +175,9 @@ static inline int opa_mtu_enum_to_int(int mtu) > */ > static inline int verbs_mtu_enum_to_int(struct ib_device *dev, enum ib_mtu > mtu) { > - int val; > - > - /* Constraining 10KB packets to 8KB packets */ > if (mtu == (enum ib_mtu)OPA_MTU_10240) > mtu = OPA_MTU_8192; > - val = opa_mtu_enum_to_int((int)mtu); > - if (val > 0) > - return val; > - return ib_mtu_enum_to_int(mtu); > + return opa_mtu_enum_to_int((enum opa_mtu)mtu); > } > > int hfi1_check_modify_qp(struct rvt_qp *qp, struct ib_qp_attr *attr, @@ - > 308,13 +293,13 @@ int hfi1_check_send_wqe(struct rvt_qp *qp, > break; > case IB_QPT_SMI: > ah = ibah_to_rvtah(wqe->ud_wr.ah); > - if (wqe->length > (1 << ah->log_pmtu)) > + if (wqe->length > ah->pmtu) > return -EINVAL; > break; > case IB_QPT_GSI: > case IB_QPT_UD: > ah = ibah_to_rvtah(wqe->ud_wr.ah); > - if (wqe->length > (1 << ah->log_pmtu)) > + if (wqe->length > ah->pmtu) > return -EINVAL; > if (ibp->sl_to_sc[rdma_ah_get_sl(&ah->attr)] == 0xf) > return -EINVAL; > diff --git a/drivers/infiniband/hw/hfi1/verbs.c > b/drivers/infiniband/hw/hfi1/verbs.c > index fc2e44c..ab67f8d 100644 > --- a/drivers/infiniband/hw/hfi1/verbs.c > +++ b/drivers/infiniband/hw/hfi1/verbs.c > @@ -1481,9 +1481,10 @@ static int query_port(struct rvt_dev_info *rdi, u8 > port_num, > * from the Path Records to us will get the new 8k MTU. Those that > * attempt to process the MTU enum may fail in various ways. > */ > - props->max_mtu = mtu_to_enum((!valid_ib_mtu(hfi1_max_mtu) ? > - 4096 : hfi1_max_mtu), IB_MTU_4096); > - props->active_mtu = !valid_ib_mtu(ppd->ibmtu) ? props->max_mtu : > + props->max_mtu = mtu_to_enum((!valid_opa_mtu(hfi1_max_mtu) ? > + 4096 : hfi1_max_mtu), > + IB_MTU_4096); > + props->active_mtu = !valid_opa_mtu(ppd->ibmtu) ? props->max_mtu : > mtu_to_enum(ppd->ibmtu, IB_MTU_4096); > > /* > @@ -1617,7 +1618,7 @@ static void hfi1_notify_new_ah(struct ib_device > *ibdev, > dd = dd_from_ppd(ppd); > ah->vl = sc_to_vlt(dd, sc5); > if (ah->vl < num_vls || ah->vl == 15) > - ah->log_pmtu = ilog2(dd->vld[ah->vl].mtu); > + ah->pmtu = dd->vld[ah->vl].mtu; > } > > /** > diff --git a/drivers/infiniband/hw/qib/qib_qp.c > b/drivers/infiniband/hw/qib/qib_qp.c > index 344e401..59f0093 100644 > --- a/drivers/infiniband/hw/qib/qib_qp.c > +++ b/drivers/infiniband/hw/qib/qib_qp.c > @@ -402,7 +402,7 @@ int qib_check_send_wqe(struct rvt_qp *qp, > case IB_QPT_GSI: > case IB_QPT_UD: > ah = ibah_to_rvtah(wqe->ud_wr.ah); > - if (wqe->length > (1 << ah->log_pmtu)) > + if (wqe->length > ah->pmtu) > return -EINVAL; > /* progress hint */ > ret = 1; > diff --git a/drivers/infiniband/hw/qib/qib_verbs.c > b/drivers/infiniband/hw/qib/qib_verbs.c > index 14b4057..c5d0ae4 100644 > --- a/drivers/infiniband/hw/qib/qib_verbs.c > +++ b/drivers/infiniband/hw/qib/qib_verbs.c > @@ -1367,7 +1367,7 @@ static void qib_notify_new_ah(struct ib_device > *ibdev, > ibp = to_iport(ibdev, rdma_ah_get_port_num(ah_attr)); > ppd = ppd_from_ibp(ibp); > ah->vl = ibp->sl_to_vl[rdma_ah_get_sl(&ah->attr)]; > - ah->log_pmtu = ilog2(ppd->ibmtu); > + ah->pmtu = ppd->ibmtu; > } > > struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid) diff --git > a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c > index 6e9a351..00ddf97 100644 > --- a/drivers/infiniband/sw/rdmavt/qp.c > +++ b/drivers/infiniband/sw/rdmavt/qp.c > @@ -1835,7 +1835,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp, > qp->ibqp.qp_type != IB_QPT_RC) { > struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah); > > - log_pmtu = ah->log_pmtu; > + log_pmtu = ilog2(ah->pmtu); I don’t know if UD is widely used in IPoIB or not. But won't you have any performance impact by doing ilog2 on every work request? Just curious. > atomic_inc(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount); > } > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > index cf291f9..49af25f 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > @@ -2257,7 +2257,7 @@ void ipoib_set_dev_features(struct ipoib_dev_priv > *priv, struct ib_device *hca) > goto device_init_failed; > } > > - priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu); > + priv->max_ib_mtu = rdma_mtu_enum_to_int(hca, port, attr.max_mtu); > > /* MTU will be reset when mcast join happens */ > priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu); diff --git > a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c > b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c > index 6709328..025cb07 100644 > --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c > +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c > @@ -243,10 +243,14 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast > *mcast, > if (priv->mcast_mtu == priv->admin_mtu) > priv->admin_mtu = > priv->mcast_mtu = > - IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast- > >mcmember.mtu)); > + IPOIB_UD_MTU(rdma_mtu_enum_to_int(priv->ca, > + priv->port, > + priv->broadcast- > >mcmember.mtu)); > else > priv->mcast_mtu = > - IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast- > >mcmember.mtu)); > + IPOIB_UD_MTU(rdma_mtu_enum_to_int(priv->ca, > + priv->port, > + priv->broadcast- > >mcmember.mtu)); > > priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey); > spin_unlock_irq(&priv->lock); > diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index > e849bd0..b1fd9b1 100644 > --- a/include/rdma/ib_verbs.h > +++ b/include/rdma/ib_verbs.h > @@ -395,6 +395,11 @@ enum ib_mtu { > IB_MTU_4096 = 5 > }; > > +enum opa_mtu { > + OPA_MTU_8192 = 6, > + OPA_MTU_10240 = 7 > +}; > + > static inline int ib_mtu_enum_to_int(enum ib_mtu mtu) { > switch (mtu) { > @@ -421,6 +426,24 @@ static inline enum ib_mtu ib_mtu_int_to_enum(int > mtu) > return IB_MTU_256; > } > > +static inline int opa_mtu_enum_to_int(enum opa_mtu mtu) { > + switch (mtu) { > + case OPA_MTU_8192: return 8192; > + case OPA_MTU_10240: return 10240; > + default: return(ib_mtu_enum_to_int((enum ib_mtu) mtu)); > + } > +} > + > +static inline enum opa_mtu opa_mtu_int_to_enum(int mtu) { > + if (mtu >= 10240) > + return OPA_MTU_10240; > + else if (mtu >= 8192) > + return OPA_MTU_8192; > + else return ((enum opa_mtu) ib_mtu_int_to_enum(mtu)); } > + > enum ib_port_state { > IB_PORT_NOP = 0, > IB_PORT_DOWN = 1, > @@ -2997,6 +3020,31 @@ static inline bool rdma_cap_read_inv(struct > ib_device *dev, u32 port_num) > return rdma_protocol_iwarp(dev, port_num); } > > +/** > + * rdma_core_cap_opa_port - Return whether the RDMA Port is OPA or not. > + * > + * @device: Device > + * @port_num: 1 based Port number > + * > + * Return true if the port is an Intel OPA port false if not. > + */ > + > +static inline bool rdma_core_cap_opa_port(struct ib_device *device, > + u32 port_num) > +{ > + return (device->port_immutable[port_num].core_cap_flags & > RDMA_CORE_PORT_INTEL_OPA) > + == RDMA_CORE_PORT_INTEL_OPA; > +} > + > +static inline int rdma_mtu_enum_to_int(struct ib_device *device, u8 port, > + int mtu) > +{ > + if (rdma_core_cap_opa_port(device, port)) > + return opa_mtu_enum_to_int((enum opa_mtu) mtu); > + else > + return ib_mtu_enum_to_int((enum ib_mtu) mtu); } > + Above rdma function needs kdoc comment block, mainly to describe what is mtu as its typecasted internally. ��.n��������+%������w��{.n�����{���fk��ܨ}���Ơz�j:+v�����w����ޙ��&�)ߡ�a����z�ޗ���ݢj��w�f