Re: [PATCH v3] IB/iser: Fix RNR errors

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, 2018-04-30 at 13:31 +0300, Max Gurtovoy wrote:
> Jason/Doug/Leon,
> can you please apply this patch ?
> Sergey made the needed changes.
> 
> -Max.
> 
> On 4/16/2018 6:00 PM, Sergey Gorenko wrote:
> > Some users complain about RNR errors on the target, when heavy
> > high-priority tasks run on the initiator. After the
> > investigation, we found out that the receive WRs were exhausted,
> > because the initiator could not post them on time.
> > 
> > Receive work requests are posted in chunks of min_posted_rx to
> > reduce the number of hits to the HCA. The WRs are posted in the
> > receive completion handler when the number of free receive buffers
> > reaches min_posted_rx. But on a high-loaded host, receive CQEs
> > processing can be delayed and all receive WRs will be exhausted.
> > In this case, the target will get an RNR error.
> > 
> > To avoid this, we post receive WR, as soon as at least one receive
> > buffer is freed. This increases the number of hits to the HCA, but
> > test results show that performance degradation is not significant.
> > 
> > Performance results running fio (8 jobs, 64 iodepth) using ramdisk
> > (w/w.o patch):
> > 
> > bs      IOPS(randread)    IOPS(randwrite)
> > ------  ---------------   ---------------
> > 512     329.4K / 340.3K   379.1K / 387.7K

3% performance drop

> > 1K      333.4K / 337.6K   364.2K / 370.0K

1.4% performance drop

> > 2K      321.1K / 323.5K   334.2K / 337.8K

0.75% performance drop

I know you said the performance hit was not significant, and by the time
you get to 2k reads/writes, I agree with you, but at the low end, I'm
not sure you can call 3% "not significant".

Is a 3% performance hit better than the transient RNR error?  And is
this the only solution available?

> > 4K      300.7K / 302.9K   290.2K / 291.6K
> > 8K      235.9K / 237.1K   228.2K / 228.8K
> > 16K     176.3K / 177.0K   126.3K / 125.9K
> > 32K     115.2K / 115.4K    82.2K / 82.0K
> > 64K      70.7K / 70.7K     47.8K / 47.6K
> > 128K     38.5K / 38.6K     25.8K / 25.7K
> > 256K     20.7K / 20.7K     13.7K / 13.6K
> > 512K     10.0K / 10.0K      7.0K / 7.0K
> > 
> > Signed-off-by: Sergey Gorenko <sergeygo@xxxxxxxxxxxx>
> > Signed-off-by: Vladimir Neyelov <vladimirn@xxxxxxxxxxxx>
> > Reviewed-by: Max Gurtovoy <maxg@xxxxxxxxxxxx>
> > ---
> > Changes from v2:
> >   - Propagate return code in iser_post_rx_bufs() as suggested by Max.
> > Changes from v1:
> >   - Follow Leon's suggestion, the changelog is placed after "---" and
> >     after all Signed-off-by/Reviewed-by.
> > Changes from v0:
> >   - Follow Sagi's suggestion, iser_post_rx_bufs() is refactored. Removed
> >     unnecessary variable and centralized the exit from the function.
> > ---
> >   drivers/infiniband/ulp/iser/iscsi_iser.h     | 14 +-----
> >   drivers/infiniband/ulp/iser/iser_initiator.c | 64 ++++++++++++----------------
> >   drivers/infiniband/ulp/iser/iser_verbs.c     | 39 +++++------------
> >   3 files changed, 40 insertions(+), 77 deletions(-)
> > 
> > diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
> > index c1ae4aeae2f9..925996ddcf9f 100644
> > --- a/drivers/infiniband/ulp/iser/iscsi_iser.h
> > +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
> > @@ -123,8 +123,6 @@
> >   
> >   #define ISER_QP_MAX_RECV_DTOS		(ISER_DEF_XMIT_CMDS_MAX)
> >   
> > -#define ISER_MIN_POSTED_RX		(ISER_DEF_XMIT_CMDS_MAX >> 2)
> > -
> >   /* the max TX (send) WR supported by the iSER QP is defined by                 *
> >    * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect   *
> >    * to have at max for SCSI command. The tx posting & completion handling code  *
> > @@ -452,9 +450,7 @@ struct iser_fr_pool {
> >    *
> >    * @cma_id:              rdma_cm connection maneger handle
> >    * @qp:                  Connection Queue-pair
> > - * @post_recv_buf_count: post receive counter
> >    * @sig_count:           send work request signal count
> > - * @rx_wr:               receive work request for batch posts
> >    * @device:              reference to iser device
> >    * @comp:                iser completion context
> >    * @fr_pool:             connection fast registration poool
> > @@ -463,9 +459,7 @@ struct iser_fr_pool {
> >   struct ib_conn {
> >   	struct rdma_cm_id           *cma_id;
> >   	struct ib_qp	            *qp;
> > -	int                          post_recv_buf_count;
> >   	u8                           sig_count;
> > -	struct ib_recv_wr	     rx_wr[ISER_MIN_POSTED_RX];
> >   	struct iser_device          *device;
> >   	struct iser_comp	    *comp;
> >   	struct iser_fr_pool          fr_pool;
> > @@ -482,8 +476,6 @@ struct ib_conn {
> >    * @state:            connection logical state
> >    * @qp_max_recv_dtos: maximum number of data outs, corresponds
> >    *                    to max number of post recvs
> > - * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
> > - * @min_posted_rx:    (qp_max_recv_dtos >> 2)
> >    * @max_cmds:         maximum cmds allowed for this connection
> >    * @name:             connection peer portal
> >    * @release_work:     deffered work for release job
> > @@ -494,7 +486,6 @@ struct ib_conn {
> >    *                    (state is ISER_CONN_UP)
> >    * @conn_list:        entry in ig conn list
> >    * @login_desc:       login descriptor
> > - * @rx_desc_head:     head of rx_descs cyclic buffer
> >    * @rx_descs:         rx buffers array (cyclic buffer)
> >    * @num_rx_descs:     number of rx descriptors
> >    * @scsi_sg_tablesize: scsi host sg_tablesize
> > @@ -505,8 +496,6 @@ struct iser_conn {
> >   	struct iscsi_endpoint	     *ep;
> >   	enum iser_conn_state	     state;
> >   	unsigned		     qp_max_recv_dtos;
> > -	unsigned		     qp_max_recv_dtos_mask;
> > -	unsigned		     min_posted_rx;
> >   	u16                          max_cmds;
> >   	char 			     name[ISER_OBJECT_NAME_SIZE];
> >   	struct work_struct	     release_work;
> > @@ -516,7 +505,6 @@ struct iser_conn {
> >   	struct completion	     up_completion;
> >   	struct list_head	     conn_list;
> >   	struct iser_login_desc       login_desc;
> > -	unsigned int 		     rx_desc_head;
> >   	struct iser_rx_desc	     *rx_descs;
> >   	u32                          num_rx_descs;
> >   	unsigned short               scsi_sg_tablesize;
> > @@ -638,7 +626,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
> >   			    enum iser_data_dir cmd_dir);
> >   
> >   int  iser_post_recvl(struct iser_conn *iser_conn);
> > -int  iser_post_recvm(struct iser_conn *iser_conn, int count);
> > +int  iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc);
> >   int  iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
> >   		    bool signal);
> >   
> > diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
> > index df49c4eb67f7..5fc648f9afe1 100644
> > --- a/drivers/infiniband/ulp/iser/iser_initiator.c
> > +++ b/drivers/infiniband/ulp/iser/iser_initiator.c
> > @@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
> >   	struct iser_device *device = ib_conn->device;
> >   
> >   	iser_conn->qp_max_recv_dtos = session->cmds_max;
> > -	iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
> > -	iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
> >   
> >   	if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max,
> >   					   iser_conn->scsi_sg_tablesize))
> > @@ -279,7 +277,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
> >   		rx_sg->lkey = device->pd->local_dma_lkey;
> >   	}
> >   
> > -	iser_conn->rx_desc_head = 0;
> >   	return 0;
> >   
> >   rx_desc_dma_map_failed:
> > @@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn)
> >   static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
> >   {
> >   	struct iser_conn *iser_conn = conn->dd_data;
> > -	struct ib_conn *ib_conn = &iser_conn->ib_conn;
> >   	struct iscsi_session *session = conn->session;
> > +	int err = 0;
> > +	int i;
> >   
> >   	iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
> >   	/* check if this is the last login - going to full feature phase */
> >   	if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
> > -		return 0;
> > -
> > -	/*
> > -	 * Check that there is one posted recv buffer
> > -	 * (for the last login response).
> > -	 */
> > -	WARN_ON(ib_conn->post_recv_buf_count != 1);
> > +		goto out;
> >   
> >   	if (session->discovery_sess) {
> >   		iser_info("Discovery session, re-using login RX buffer\n");
> > -		return 0;
> > -	} else
> > -		iser_info("Normal session, posting batch of RX %d buffers\n",
> > -			  iser_conn->min_posted_rx);
> > +		goto out;
> > +	}
> >   
> > -	/* Initial post receive buffers */
> > -	if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx))
> > -		return -ENOMEM;
> > +	iser_info("Normal session, posting batch of RX %d buffers\n",
> > +		  iser_conn->qp_max_recv_dtos - 1);
> >   
> > -	return 0;
> > +	/*
> > +	 * Initial post receive buffers.
> > +	 * There is one already posted recv buffer (for the last login
> > +	 * response). Therefore, the first recv buffer is skipped here.
> > +	 */
> > +	for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) {
> > +		err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]);
> > +		if (err)
> > +			goto out;
> > +	}
> > +out:
> > +	return err;
> >   }
> >   
> >   static inline bool iser_signal_comp(u8 sig_count)
> > @@ -585,7 +585,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc)
> >   				      desc->rsp_dma, ISER_RX_LOGIN_SIZE,
> >   				      DMA_FROM_DEVICE);
> >   
> > -	ib_conn->post_recv_buf_count--;
> > +	if (iser_conn->iscsi_conn->session->discovery_sess)
> > +		return;
> > +
> > +	/* Post the first RX buffer that is skipped in iser_post_rx_bufs() */
> > +	iser_post_recvm(iser_conn, iser_conn->rx_descs);
> >   }
> >   
> >   static inline void
> > @@ -645,8 +649,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
> >   	struct iser_conn *iser_conn = to_iser_conn(ib_conn);
> >   	struct iser_rx_desc *desc = iser_rx(wc->wr_cqe);
> >   	struct iscsi_hdr *hdr;
> > -	int length;
> > -	int outstanding, count, err;
> > +	int length, err;
> >   
> >   	if (unlikely(wc->status != IB_WC_SUCCESS)) {
> >   		iser_err_comp(wc, "task_rsp");
> > @@ -675,20 +678,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc)
> >   				      desc->dma_addr, ISER_RX_PAYLOAD_SIZE,
> >   				      DMA_FROM_DEVICE);
> >   
> > -	/* decrementing conn->post_recv_buf_count only --after-- freeing the   *
> > -	 * task eliminates the need to worry on tasks which are completed in   *
> > -	 * parallel to the execution of iser_conn_term. So the code that waits *
> > -	 * for the posted rx bufs refcount to become zero handles everything   */
> > -	ib_conn->post_recv_buf_count--;
> > -
> > -	outstanding = ib_conn->post_recv_buf_count;
> > -	if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
> > -		count = min(iser_conn->qp_max_recv_dtos - outstanding,
> > -			    iser_conn->min_posted_rx);
> > -		err = iser_post_recvm(iser_conn, count);
> > -		if (err)
> > -			iser_err("posting %d rx bufs err %d\n", count, err);
> > -	}
> > +	err = iser_post_recvm(iser_conn, desc);
> > +	if (err)
> > +		iser_err("posting rx buffer err %d\n", err);
> >   }
> >   
> >   void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc)
> > diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
> > index 56b7240a3fc3..2019500a2425 100644
> > --- a/drivers/infiniband/ulp/iser/iser_verbs.c
> > +++ b/drivers/infiniband/ulp/iser/iser_verbs.c
> > @@ -936,7 +936,6 @@ void iser_conn_init(struct iser_conn *iser_conn)
> >   	INIT_LIST_HEAD(&iser_conn->conn_list);
> >   	mutex_init(&iser_conn->state_mutex);
> >   
> > -	ib_conn->post_recv_buf_count = 0;
> >   	ib_conn->reg_cqe.done = iser_reg_comp;
> >   }
> >   
> > @@ -1020,44 +1019,28 @@ int iser_post_recvl(struct iser_conn *iser_conn)
> >   	wr.num_sge = 1;
> >   	wr.next = NULL;
> >   
> > -	ib_conn->post_recv_buf_count++;
> >   	ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed);
> > -	if (ib_ret) {
> > +	if (ib_ret)
> >   		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
> > -		ib_conn->post_recv_buf_count--;
> > -	}
> >   
> >   	return ib_ret;
> >   }
> >   
> > -int iser_post_recvm(struct iser_conn *iser_conn, int count)
> > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc)
> >   {
> >   	struct ib_conn *ib_conn = &iser_conn->ib_conn;
> > -	unsigned int my_rx_head = iser_conn->rx_desc_head;
> > -	struct iser_rx_desc *rx_desc;
> > -	struct ib_recv_wr *wr, *wr_failed;
> > -	int i, ib_ret;
> > -
> > -	for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) {
> > -		rx_desc = &iser_conn->rx_descs[my_rx_head];
> > -		rx_desc->cqe.done = iser_task_rsp;
> > -		wr->wr_cqe = &rx_desc->cqe;
> > -		wr->sg_list = &rx_desc->rx_sg;
> > -		wr->num_sge = 1;
> > -		wr->next = wr + 1;
> > -		my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
> > -	}
> > +	struct ib_recv_wr wr, *wr_failed;
> > +	int ib_ret;
> >   
> > -	wr--;
> > -	wr->next = NULL; /* mark end of work requests list */
> > +	rx_desc->cqe.done = iser_task_rsp;
> > +	wr.wr_cqe = &rx_desc->cqe;
> > +	wr.sg_list = &rx_desc->rx_sg;
> > +	wr.num_sge = 1;
> > +	wr.next = NULL;
> >   
> > -	ib_conn->post_recv_buf_count += count;
> > -	ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed);
> > -	if (ib_ret) {
> > +	ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed);
> > +	if (ib_ret)
> >   		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
> > -		ib_conn->post_recv_buf_count -= count;
> > -	} else
> > -		iser_conn->rx_desc_head = my_rx_head;
> >   
> >   	return ib_ret;
> >   }
> > 

-- 
Doug Ledford <dledford@xxxxxxxxxx>
    GPG KeyID: B826A3330E572FDD
    Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD

Attachment: signature.asc
Description: This is a digitally signed message part


[Index of Archives]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Photo]     [Yosemite News]     [Yosemite Photos]     [Linux Kernel]     [Linux SCSI]     [XFree86]

  Powered by Linux