On Tue, Dec 03, 2024 at 03:52:21PM +0200, Leon Romanovsky wrote: > Thus, replace with a state machine as the following: > * IB_MAD_STATE_SEND_START - MAD was sent to the QP and is waiting > for completion notification > * IB_MAD_STATE_WAIT_RESP - MAD send completed successfully, waiting > for a response > * IB_MAD_STATE_EARLY_RESP - Response came early, before send > completion notification > * IB_MAD_STATE_DONE - MAD processing completed This is a good idea, that refcounting stuff is totally inscrutable. > @@ -1773,9 +1772,13 @@ ib_find_send_mad(const struct ib_mad_agent_private *mad_agent_priv, > void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr) > { > mad_send_wr->timeout = 0; > - if (mad_send_wr->refcount == 1) > + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) { > + mad_send_wr->state = IB_MAD_STATE_DONE; > list_move_tail(&mad_send_wr->agent_list, > &mad_send_wr->mad_agent_priv->done_list); > + } else { > + mad_send_wr->state = IB_MAD_STATE_EARLY_RESP; Let's be even more explicit on all these transitions: WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START); Maybe with some macro compiled out unless lockdep is on or something. But there are clear rules here how the FSM should work, lets document and check them. Let me try to guess what they should be at each point.. > static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, > @@ -2195,6 +2198,7 @@ static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr) > list_item = &mad_agent_priv->wait_list; > } > > + mad_send_wr->state = IB_MAD_STATE_WAIT_RESP; > list_add(&mad_send_wr->agent_list, list_item); WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START); > @@ -2222,6 +2226,11 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, > > mad_agent_priv = mad_send_wr->mad_agent_priv; > spin_lock_irqsave(&mad_agent_priv->lock, flags); > + if (mad_send_wr->state == IB_MAD_STATE_EARLY_RESP) { > + mad_send_wr->state = IB_MAD_STATE_DONE; > + goto done; > + } else WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START); This flow doesn't seem to touch the agent_list? If we got to EARLY_RESP then mad_done didn't remove it from the agent list and we goto done which skips it? Seems like a bug? > @@ -2232,14 +2241,10 @@ void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr, > if (mad_send_wc->status != IB_WC_SUCCESS && > mad_send_wr->status == IB_WC_SUCCESS) { > mad_send_wr->status = mad_send_wc->status; mad_send_wr->state = IB_MAD_STATE_DONE ?? > - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); > - } > - > - if (--mad_send_wr->refcount > 0) { > - if (mad_send_wr->refcount == 1 && mad_send_wr->timeout && > - mad_send_wr->status == IB_WC_SUCCESS) { > - wait_for_response(mad_send_wr); > - } > + } else if (mad_send_wr->status == IB_WC_SUCCESS && > + mad_send_wr->timeout && > + mad_send_wr->state == IB_MAD_STATE_SEND_START) { > + wait_for_response(mad_send_wr); > goto done; > } This sequence looks a little off, if we got a EARLY_RESP then we skip everything else. It is possible for the send WC to indicate fail even if we got a matching response and all that checking got skipped. Maybe like this: if (mad_send_wc->status != IB_WC_SUCCESS && mad_send_wr->status == IB_WC_SUCCESS) { mad_send_wr->status = mad_send_wc->status; WARN_ON(mad_send_wr->state != IB_MAD_STATE_EARLY_RESP && mad_send_wr->state != IB_MAD_STATE_SEND_START); } else if (mad_send_wr->status == IB_WC_SUCCESS && mad_send_wr->timeout) { if (mad_send_wr->state == IB_MAD_STATE_SEND_START) { wait_for_response(mad_send_wr); goto done; } WARN_ON(mad_send_wr->state == IB_MAD_STATE_EARLY_RESP); } /* Remove send from MAD agent and notify client of completion */ mad_send_wr->state = IB_MAD_STATE_DONE; list_del(&mad_send_wr->agent_list); > @@ -2407,12 +2412,9 @@ static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) > > spin_lock_irqsave(&mad_agent_priv->lock, flags); > list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr, > - &mad_agent_priv->send_list, agent_list) { > - if (mad_send_wr->status == IB_WC_SUCCESS) { > + &mad_agent_priv->send_list, agent_list) > + if (mad_send_wr->status == IB_WC_SUCCESS) > mad_send_wr->status = IB_WC_WR_FLUSH_ERR; > - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); > - } That status check feels like it should be a state check? Should it be a new state instead of storing status? What states are valid here anyhow? IB_MAD_STATE_SEND_START and IB_MAD_STATE_EARLY_RESP I guess? > @@ -2459,7 +2461,6 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) > struct ib_mad_agent_private *mad_agent_priv; > struct ib_mad_send_wr_private *mad_send_wr; > unsigned long flags; > - int active; > > if (!send_buf) > return -EINVAL; > @@ -2473,14 +2474,11 @@ int ib_modify_mad(struct ib_mad_send_buf *send_buf, u32 timeout_ms) > return -EINVAL; > } > > - active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1); > - if (!timeout_ms) { > + if (!timeout_ms) > mad_send_wr->status = IB_WC_WR_FLUSH_ERR; > - mad_send_wr->refcount -= (mad_send_wr->timeout > 0); > - } Here again, should FLUSH_ERR be its own state? This is basically canceling the mad right? Then why do we reset the timeout? > mad_send_wr->send_buf.timeout_ms = timeout_ms; > - if (active) > + if (mad_send_wr->state == IB_MAD_STATE_SEND_START) > mad_send_wr->timeout = msecs_to_jiffies(timeout_ms); > else > ib_reset_mad_timeout(mad_send_wr, timeout_ms); So this else is IB_MAD_STATE_WAIT_RESP ? > @@ -2607,7 +2605,7 @@ static int retry_send(struct ib_mad_send_wr_private *mad_send_wr) > ret = ib_send_mad(mad_send_wr); > > if (!ret) { > - mad_send_wr->refcount++; > + mad_send_wr->state = IB_MAD_STATE_SEND_START; WARN_ON(mad_send_wr->state != IB_MAD_STATE_WAIT_RESP); ? Does the caller reliably ensure this? > diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c > index 8af0619a39cd..dff264e9bb68 100644 > --- a/drivers/infiniband/core/mad_rmpp.c > +++ b/drivers/infiniband/core/mad_rmpp.c > @@ -717,13 +717,13 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, > ib_mad_complete_send_wr(mad_send_wr, &wc); > return; > } > - if (mad_send_wr->refcount == 1) > + if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP) > ib_reset_mad_timeout(mad_send_wr, > mad_send_wr->send_buf.timeout_ms); else WARN_ON(mad_send_wr->state != IB_MAD_STATE_SEND_START); > spin_unlock_irqrestore(&agent->lock, flags); > ack_ds_ack(agent, mad_recv_wc); > return; > - } else if (mad_send_wr->refcount == 1 && > + } else if (mad_send_wr->state == IB_MAD_STATE_WAIT_RESP && > mad_send_wr->seg_num < mad_send_wr->newwin && > mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) { > /* Send failure will just result in a timeout/retry */ > @@ -731,7 +731,7 @@ static void process_rmpp_ack(struct ib_mad_agent_private *agent, > if (ret) > goto out; > > - mad_send_wr->refcount++; > + mad_send_wr->state = IB_MAD_STATE_SEND_START; WARN_ON(mad_send_wr->state != IB_MAD_STATE_WAIT_RESP); ? > list_move_tail(&mad_send_wr->agent_list, > &mad_send_wr->mad_agent_priv->send_list); > } > @@ -890,7 +890,6 @@ int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr) > mad_send_wr->newwin = init_newwin(mad_send_wr); > > /* We need to wait for the final ACK even if there isn't a response */ > - mad_send_wr->refcount += (mad_send_wr->timeout == 0); WARN_ON(mad_send_wr->state != IB_MAD_STATE_WAIT_RESP); ?? Jason