> From: linux-rdma-owner@xxxxxxxxxxxxxxx <linux-rdma- > owner@xxxxxxxxxxxxxxx> On Behalf Of Michal Kalderon > > > From: Jason Gunthorpe <jgg@xxxxxxxx> > > Sent: Tuesday, October 22, 2019 10:36 PM > > > > External Email > > > > ---------------------------------------------------------------------- > > On Wed, Oct 16, 2019 at 02:42:41PM +0300, Michal Kalderon wrote: > > > > > Re-design of the iWARP CM related objects reference > > > > > counting and synchronization methods, to ensure operations > > > > > are synchronized correctly and and that memory allocated for "ep" > > > > > is released. (was leaked). > > > > > Also makes sure QP memory is not released before ep is finished > > > > > accessing it. > > > > > > > > > > Where as the QP object is created/destroyed by external operations, > > > > > the ep is created/destroyed by internal operations and represents > > > > > the tcp connection associated with the QP. > > > > > > > > > > QP destruction flow: > > > > > - needs to wait for ep establishment to complete (either > > > successfully > > > > > or with error) > > > > > - needs to wait for ep disconnect to be fully posted to avoid a > > > > > race condition of disconnect being called after reset. > > > > > - both the operations above don't always happen, so we use atomic > > > > > flags to indicate whether the qp destruction flow needs to wait > > > > > for these completions or not, if the destroy is called before > > > > > these operations began, the flows will check the flags and not > > > > > execute them ( connect / disconnect). > > > > > > > > > > We use completion structure for waiting for the completions > > > mentioned > > > > > above. > > > > > > > > > > The QP refcnt was modified to kref object. > > > > > The EP has a kref added to it to handle additional worker thread > > > > > accessing it. > > > > > > > > > > Memory Leaks - https://urldefense.proofpoint.com/v2/url?u=https- > > 3A__www.spinics.net_lists_linux- > > > 2Drdma_msg83762.html&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=7Yun > > NpwaTtA-c31OjGDRlmf54csBCXY_j41vn-0xRz4&m=DfexSyvBpCQbEUyzV- > > q_oGHiJYpLrH4Igtg963UsuZs&s=XNxYMW- > > rrECcE1vRoUReaYZcb01cMCx9X9fs_clAU1Y&e= > > > > > Reported-by Chuck Lever <chuck.lever@xxxxxxxxxx> > > > > > > > > > > Concurrency not managed correctly - > > > > > https://urldefense.proofpoint.com/v2/url?u=https- > > 3A__www.spinics.net_lists_linux- > > > 2Drdma_msg67949.html&d=DwIBAg&c=nKjWec2b6R0mOyPaz7xtfQ&r=7Yun > > NpwaTtA-c31OjGDRlmf54csBCXY_j41vn-0xRz4&m=DfexSyvBpCQbEUyzV- > > > q_oGHiJYpLrH4Igtg963UsuZs&s=zgMS6HLPvQCHcCwmmXuA8EqpZV1cX_DL- > > oPqtLJv2vs&e= > > > > > Reported-by Jason Gunthorpe <jgg@xxxxxxxx> > > > > > > > > > > Signed-off-by: Ariel Elior <ariel.elior@xxxxxxxxxxx> > > > > > Signed-off-by: Michal Kalderon <michal.kalderon@xxxxxxxxxxx> > > > > > drivers/infiniband/hw/qedr/qedr.h | 23 ++++- > > > > > drivers/infiniband/hw/qedr/qedr_iw_cm.c | 150 > > ++++++++++++++++++++++---------- > > > > > drivers/infiniband/hw/qedr/verbs.c | 42 +++++---- > > > > > 3 files changed, 141 insertions(+), 74 deletions(-) > > > > > > > > > > diff --git a/drivers/infiniband/hw/qedr/qedr.h > > b/drivers/infiniband/hw/qedr/qedr.h > > > > > index 0cfd849b13d6..8e927f6c1520 100644 > > > > > +++ b/drivers/infiniband/hw/qedr/qedr.h > > > > > @@ -40,6 +40,7 @@ > > > > > #include <linux/qed/qed_rdma_if.h> > > > > > #include <linux/qed/qede_rdma.h> > > > > > #include <linux/qed/roce_common.h> > > > > > +#include <linux/completion.h> > > > > > #include "qedr_hsi_rdma.h" > > > > > > > > > > #define QEDR_NODE_DESC "QLogic 579xx RoCE HCA" > > > > > @@ -377,10 +378,20 @@ enum qedr_qp_err_bitmap { > > > > > QEDR_QP_ERR_RQ_PBL_FULL = 32, > > > > > }; > > > > > > > > > > +enum qedr_qp_create_type { > > > > > + QEDR_QP_CREATE_NONE, > > > > > + QEDR_QP_CREATE_USER, > > > > > + QEDR_QP_CREATE_KERNEL, > > > > > +}; > > > > > + > > > > > +enum qedr_iwarp_cm_flags { > > > > > + QEDR_IWARP_CM_WAIT_FOR_CONNECT = BIT(0), > > > > > + QEDR_IWARP_CM_WAIT_FOR_DISCONNECT = BIT(1), > > > > > +}; > > > > > + > > > > > struct qedr_qp { > > > > > struct ib_qp ibqp; /* must be first */ > > > > > struct qedr_dev *dev; > > > > > - struct qedr_iw_ep *ep; > > > > > struct qedr_qp_hwq_info sq; > > > > > struct qedr_qp_hwq_info rq; > > > > > > > > > > @@ -395,6 +406,7 @@ struct qedr_qp { > > > > > u32 id; > > > > > struct qedr_pd *pd; > > > > > enum ib_qp_type qp_type; > > > > > + enum qedr_qp_create_type create_type; > > > > > struct qed_rdma_qp *qed_qp; > > > > > u32 qp_id; > > > > > u16 icid; > > > > > @@ -437,8 +449,11 @@ struct qedr_qp { > > > > > /* Relevant to qps created from user space only (applications) */ > > > > > struct qedr_userq usq; > > > > > struct qedr_userq urq; > > > > > - atomic_t refcnt; > > > > > - bool destroyed; > > > > > + > > > > > + /* synchronization objects used with iwarp ep */ > > > > > + struct kref refcnt; > > > > > + struct completion iwarp_cm_comp; > > > > > + unsigned long iwarp_cm_flags; /* enum iwarp_cm_flags */ > > > > > }; > > > > > > > > > > struct qedr_ah { > > > > > @@ -531,7 +546,7 @@ struct qedr_iw_ep { > > > > > struct iw_cm_id *cm_id; > > > > > struct qedr_qp *qp; > > > > > void *qed_context; > > > > > - u8 during_connect; > > > > > + struct kref refcnt; > > > > > }; > > > > > > > > > > static inline > > > > > diff --git a/drivers/infiniband/hw/qedr/qedr_iw_cm.c > > b/drivers/infiniband/hw/qedr/qedr_iw_cm.c > > > > > index 22881d4442b9..26204caf0975 100644 > > > > > +++ b/drivers/infiniband/hw/qedr/qedr_iw_cm.c > > > > > @@ -79,6 +79,28 @@ qedr_fill_sockaddr6(const struct > > > qed_iwarp_cm_info > > *cm_info, > > > > > } > > > > > } > > > > > > > > > > +static void qedr_iw_free_qp(struct kref *ref) > > > > > +{ > > > > > + struct qedr_qp *qp = container_of(ref, struct qedr_qp, refcnt); > > > > > + > > > > > + xa_erase(&qp->dev->qps, qp->qp_id); > > > > > > > > This probably doesn't work right because qp_id is not derived from the > > > > xa_array, but some external entity. > > > > > > > > Thus the qp_id should be removed from the xarray and kref put'd right > > > > before the qp_id is deallocated from the external manager. > > > Thanks! This is a good point. > I will remove the element from xarray and kref put before deallocation, as > you mentioned, But will wait for the kref-free function to be called ( perhaps > using completion ) Before deallocation. Hi Jason, Actually I was wrong about this one. I can't wait for the kref-free function to be called, As it requires the deallocation flow to complete for ep->qp reference to decrease. I think I can simply call xa_erase before the deallocation flow and not as part of the Kref release function and call kref_put for the qp object at the end of the flow after everything is freed so that qp can be freed. > > > > > > > Ie you want to avoid a race where the qp_id can be re-assigned but the > > > > xarray entry hasn't been freed by its kref yet. Then it would > > > > xa_insert and fail. > > > > > > > > Also the xa_insert is probably not supposed to be the _irq version > Right, this was originally an idr which was modified with the massive patchset > That changed all idrs to xarrays. I'll re-review it. > > > > > either. > > > > > > > > > @@ -224,13 +249,18 @@ qedr_iw_disconnect_event(void *context, > > > > > struct qedr_discon_work *work; > > > > > struct qedr_iw_ep *ep = (struct qedr_iw_ep *)context; > > > > > struct qedr_dev *dev = ep->dev; > > > > > - struct qedr_qp *qp = ep->qp; > > > > > > > > > > work = kzalloc(sizeof(*work), GFP_ATOMIC); > > > > > if (!work) > > > > > return; > > > > > > > > > > - qedr_iw_qp_add_ref(&qp->ibqp); > > > > > + /* We can't get a close event before disconnect, but since > > > > > + * we're scheduling a work queue we need to make sure close > > > > > + * won't delete the ep, so we increase the refcnt > > > > > + */ > > > > > + if (!kref_get_unless_zero(&ep->refcnt)) > > > > > + return; > > > > > > > > The unless_zero version should not be used without some kind of > > > > locking like this, if you have a pointer to ep then ep must be a valid > > > > pointer and it is safe to take a kref on it. > > > > > > > > If there is a risk it is not valid then this is racy in a way that > > > > only locking can fix, not unless_zero > Ok, ep is valid here, I'll modify to kref_get. Thanks. > > > > > > > > > > @@ -476,6 +508,19 @@ qedr_addr6_resolve(struct qedr_dev *dev, > > > > > return rc; > > > > > } > > > > > > > > > > +struct qedr_qp *qedr_iw_load_qp(struct qedr_dev *dev, u32 qpn) > > > > > +{ > > > > > + struct qedr_qp *qp; > > > > > + > > > > > + xa_lock(&dev->qps); > > > > > + qp = xa_load(&dev->qps, qpn); > > > > > + if (!qp || !kref_get_unless_zero(&qp->refcnt)) > > > > > + qp = NULL; > > > > > > > > See, here is is OK because qp can't be freed under the xa_lock. > > > > > > > > However, this unless_zero also will not be needed once the xa_erase is > > > > moved to the right spot. > Right. Thanks. > > > > > > > > > > + /* Wait for the connection setup to complete */ > > > > > + if > > (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_CONNECT, > > > > > + &qp->iwarp_cm_flags)) > > > > > + wait_for_completion(&qp->iwarp_cm_comp); > > > > > + > > > > > + if > > (test_and_set_bit(QEDR_IWARP_CM_WAIT_FOR_DISCONNECT, > > > > > + &qp->iwarp_cm_flags)) > > > > > + wait_for_completion(&qp->iwarp_cm_comp); > > > > > } > > > > > > > > These atomics seem mis-named, and I'm unclear how they can both be > > > > waiting on the same completion? > > In IWARP the connection establishment and disconnect are offloaded to hw > and asynchronous. > The first waits for CONNECT to complete. (completes asynchronously) If we > are in the middle of a connect that was offloaded (or after connect) the bit > will be on and completion will be completed once the connection is fully > established. > We want to wait before destroying the qp due to hw constraints (can't > destroy during connect). > > The seconds waits for DISCONNECT to complete ( doesn't always occur, only > if a graceful disconnect was initiated on either side) Disconnect can't occur on > a connection not established yet, so we can't get the completion of the > disconnect instead. > Similar for connect, once we start the disconnect we turn on the bit and will > complete once the disconnect completes. > > I didn't see a reason to use another completion structure, since from what I > read complete does comp->done++ and wait_for_completion checks done > and eventually does done-- so it can be used several times if there is no need > to distinguish which event occurred first (in this case there is only one option > first connect then disconnect and disconnect can't occur without connect) > > But if this is wrong I will add another completion structure. > > Thanks, > Michal > > > > > > > > > @@ -2490,11 +2488,11 @@ int qedr_destroy_qp(struct ib_qp *ibqp, > > > struct > > ib_udata *udata) > > > > > > > > > > qedr_free_qp_resources(dev, qp, udata); > > > > > > > > > > - if (atomic_dec_and_test(&qp->refcnt) && > > > > > - rdma_protocol_iwarp(&dev->ibdev, 1)) { > > > > > - xa_erase_irq(&dev->qps, qp->qp_id); > > > > > > > > This is probably too late, it should be done before the qp_id could be > > > > recycled. > > > > > > > > Jason