On Mon, 2018-04-30 at 13:31 +0300, Max Gurtovoy wrote: > Jason/Doug/Leon, > can you please apply this patch ? > Sergey made the needed changes. > > -Max. > > On 4/16/2018 6:00 PM, Sergey Gorenko wrote: > > Some users complain about RNR errors on the target, when heavy > > high-priority tasks run on the initiator. After the > > investigation, we found out that the receive WRs were exhausted, > > because the initiator could not post them on time. > > > > Receive work requests are posted in chunks of min_posted_rx to > > reduce the number of hits to the HCA. The WRs are posted in the > > receive completion handler when the number of free receive buffers > > reaches min_posted_rx. But on a high-loaded host, receive CQEs > > processing can be delayed and all receive WRs will be exhausted. > > In this case, the target will get an RNR error. > > > > To avoid this, we post receive WR, as soon as at least one receive > > buffer is freed. This increases the number of hits to the HCA, but > > test results show that performance degradation is not significant. > > > > Performance results running fio (8 jobs, 64 iodepth) using ramdisk > > (w/w.o patch): > > > > bs IOPS(randread) IOPS(randwrite) > > ------ --------------- --------------- > > 512 329.4K / 340.3K 379.1K / 387.7K 3% performance drop > > 1K 333.4K / 337.6K 364.2K / 370.0K 1.4% performance drop > > 2K 321.1K / 323.5K 334.2K / 337.8K .75% performance drop I know you said the performance hit was not significant, and by the time you get to 2k reads/writes, I agree with you, but at the low end, I'm not sure you can call 3% "not significant". Is a 3% performance hit better than the transient rnr error? And is this the only solution available? 
> > 4K 300.7K / 302.9K 290.2K / 291.6K > > 8K 235.9K / 237.1K 228.2K / 228.8K > > 16K 176.3K / 177.0K 126.3K / 125.9K > > 32K 115.2K / 115.4K 82.2K / 82.0K > > 64K 70.7K / 70.7K 47.8K / 47.6K > > 128K 38.5K / 38.6K 25.8K / 25.7K > > 256K 20.7K / 20.7K 13.7K / 13.6K > > 512K 10.0K / 10.0K 7.0K / 7.0K > > > > Signed-off-by: Sergey Gorenko <sergeygo@xxxxxxxxxxxx> > > Signed-off-by: Vladimir Neyelov <vladimirn@xxxxxxxxxxxx> > > Reviewed-by: Max Gurtovoy <maxg@xxxxxxxxxxxx> > > --- > > Changes from v2: > > - Propagate return code in iser_post_rx_bufs() as suggested by Max. > > Changes from v1: > > - Follow Leon's suggestion, the changelog is placed after "---" and > > after all Signed-off-by/Reviewed-by. > > Changes from v0: > > - Follow Sagi's suggestion, iser_post_rx_bufs() is refactored. Removed > > unnecessary variable and centralized the exit from the function. > > --- > > drivers/infiniband/ulp/iser/iscsi_iser.h | 14 +----- > > drivers/infiniband/ulp/iser/iser_initiator.c | 64 ++++++++++++---------------- > > drivers/infiniband/ulp/iser/iser_verbs.c | 39 +++++------------ > > 3 files changed, 40 insertions(+), 77 deletions(-) > > > > diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h > > index c1ae4aeae2f9..925996ddcf9f 100644 > > --- a/drivers/infiniband/ulp/iser/iscsi_iser.h > > +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h > > @@ -123,8 +123,6 @@ > > > > #define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX) > > > > -#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2) > > - > > /* the max TX (send) WR supported by the iSER QP is defined by * > > * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect * > > * to have at max for SCSI command. 
The tx posting & completion handling code * > > @@ -452,9 +450,7 @@ struct iser_fr_pool { > > * > > * @cma_id: rdma_cm connection maneger handle > > * @qp: Connection Queue-pair > > - * @post_recv_buf_count: post receive counter > > * @sig_count: send work request signal count > > - * @rx_wr: receive work request for batch posts > > * @device: reference to iser device > > * @comp: iser completion context > > * @fr_pool: connection fast registration poool > > @@ -463,9 +459,7 @@ struct iser_fr_pool { > > struct ib_conn { > > struct rdma_cm_id *cma_id; > > struct ib_qp *qp; > > - int post_recv_buf_count; > > u8 sig_count; > > - struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; > > struct iser_device *device; > > struct iser_comp *comp; > > struct iser_fr_pool fr_pool; > > @@ -482,8 +476,6 @@ struct ib_conn { > > * @state: connection logical state > > * @qp_max_recv_dtos: maximum number of data outs, corresponds > > * to max number of post recvs > > - * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1) > > - * @min_posted_rx: (qp_max_recv_dtos >> 2) > > * @max_cmds: maximum cmds allowed for this connection > > * @name: connection peer portal > > * @release_work: deffered work for release job > > @@ -494,7 +486,6 @@ struct ib_conn { > > * (state is ISER_CONN_UP) > > * @conn_list: entry in ig conn list > > * @login_desc: login descriptor > > - * @rx_desc_head: head of rx_descs cyclic buffer > > * @rx_descs: rx buffers array (cyclic buffer) > > * @num_rx_descs: number of rx descriptors > > * @scsi_sg_tablesize: scsi host sg_tablesize > > @@ -505,8 +496,6 @@ struct iser_conn { > > struct iscsi_endpoint *ep; > > enum iser_conn_state state; > > unsigned qp_max_recv_dtos; > > - unsigned qp_max_recv_dtos_mask; > > - unsigned min_posted_rx; > > u16 max_cmds; > > char name[ISER_OBJECT_NAME_SIZE]; > > struct work_struct release_work; > > @@ -516,7 +505,6 @@ struct iser_conn { > > struct completion up_completion; > > struct list_head conn_list; > > struct iser_login_desc login_desc; > > 
- unsigned int rx_desc_head; > > struct iser_rx_desc *rx_descs; > > u32 num_rx_descs; > > unsigned short scsi_sg_tablesize; > > @@ -638,7 +626,7 @@ void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task, > > enum iser_data_dir cmd_dir); > > > > int iser_post_recvl(struct iser_conn *iser_conn); > > -int iser_post_recvm(struct iser_conn *iser_conn, int count); > > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc); > > int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, > > bool signal); > > > > diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c > > index df49c4eb67f7..5fc648f9afe1 100644 > > --- a/drivers/infiniband/ulp/iser/iser_initiator.c > > +++ b/drivers/infiniband/ulp/iser/iser_initiator.c > > @@ -247,8 +247,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, > > struct iser_device *device = ib_conn->device; > > > > iser_conn->qp_max_recv_dtos = session->cmds_max; > > - iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */ > > - iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2; > > > > if (device->reg_ops->alloc_reg_res(ib_conn, session->scsi_cmds_max, > > iser_conn->scsi_sg_tablesize)) > > @@ -279,7 +277,6 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, > > rx_sg->lkey = device->pd->local_dma_lkey; > > } > > > > - iser_conn->rx_desc_head = 0; > > return 0; > > > > rx_desc_dma_map_failed: > > @@ -322,32 +319,35 @@ void iser_free_rx_descriptors(struct iser_conn *iser_conn) > > static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req) > > { > > struct iser_conn *iser_conn = conn->dd_data; > > - struct ib_conn *ib_conn = &iser_conn->ib_conn; > > struct iscsi_session *session = conn->session; > > + int err = 0; > > + int i; > > > > iser_dbg("req op %x flags %x\n", req->opcode, req->flags); > > /* check if this is the last login - going to full feature phase */ > > if 
((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE) > > - return 0; > > - > > - /* > > - * Check that there is one posted recv buffer > > - * (for the last login response). > > - */ > > - WARN_ON(ib_conn->post_recv_buf_count != 1); > > + goto out; > > > > if (session->discovery_sess) { > > iser_info("Discovery session, re-using login RX buffer\n"); > > - return 0; > > - } else > > - iser_info("Normal session, posting batch of RX %d buffers\n", > > - iser_conn->min_posted_rx); > > + goto out; > > + } > > > > - /* Initial post receive buffers */ > > - if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx)) > > - return -ENOMEM; > > + iser_info("Normal session, posting batch of RX %d buffers\n", > > + iser_conn->qp_max_recv_dtos - 1); > > > > - return 0; > > + /* > > + * Initial post receive buffers. > > + * There is one already posted recv buffer (for the last login > > + * response). Therefore, the first recv buffer is skipped here. > > + */ > > + for (i = 1; i < iser_conn->qp_max_recv_dtos; i++) { > > + err = iser_post_recvm(iser_conn, &iser_conn->rx_descs[i]); > > + if (err) > > + goto out; > > + } > > +out: > > + return err; > > } > > > > static inline bool iser_signal_comp(u8 sig_count) > > @@ -585,7 +585,11 @@ void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) > > desc->rsp_dma, ISER_RX_LOGIN_SIZE, > > DMA_FROM_DEVICE); > > > > - ib_conn->post_recv_buf_count--; > > + if (iser_conn->iscsi_conn->session->discovery_sess) > > + return; > > + > > + /* Post the first RX buffer that is skipped in iser_post_rx_bufs() */ > > + iser_post_recvm(iser_conn, iser_conn->rx_descs); > > } > > > > static inline void > > @@ -645,8 +649,7 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) > > struct iser_conn *iser_conn = to_iser_conn(ib_conn); > > struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); > > struct iscsi_hdr *hdr; > > - int length; > > - int outstanding, count, err; > > + int length, err; > > > > if (unlikely(wc->status != IB_WC_SUCCESS)) { 
> > iser_err_comp(wc, "task_rsp"); > > @@ -675,20 +678,9 @@ void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) > > desc->dma_addr, ISER_RX_PAYLOAD_SIZE, > > DMA_FROM_DEVICE); > > > > - /* decrementing conn->post_recv_buf_count only --after-- freeing the * > > - * task eliminates the need to worry on tasks which are completed in * > > - * parallel to the execution of iser_conn_term. So the code that waits * > > - * for the posted rx bufs refcount to become zero handles everything */ > > - ib_conn->post_recv_buf_count--; > > - > > - outstanding = ib_conn->post_recv_buf_count; > > - if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { > > - count = min(iser_conn->qp_max_recv_dtos - outstanding, > > - iser_conn->min_posted_rx); > > - err = iser_post_recvm(iser_conn, count); > > - if (err) > > - iser_err("posting %d rx bufs err %d\n", count, err); > > - } > > + err = iser_post_recvm(iser_conn, desc); > > + if (err) > > + iser_err("posting rx buffer err %d\n", err); > > } > > > > void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) > > diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c > > index 56b7240a3fc3..2019500a2425 100644 > > --- a/drivers/infiniband/ulp/iser/iser_verbs.c > > +++ b/drivers/infiniband/ulp/iser/iser_verbs.c > > @@ -936,7 +936,6 @@ void iser_conn_init(struct iser_conn *iser_conn) > > INIT_LIST_HEAD(&iser_conn->conn_list); > > mutex_init(&iser_conn->state_mutex); > > > > - ib_conn->post_recv_buf_count = 0; > > ib_conn->reg_cqe.done = iser_reg_comp; > > } > > > > @@ -1020,44 +1019,28 @@ int iser_post_recvl(struct iser_conn *iser_conn) > > wr.num_sge = 1; > > wr.next = NULL; > > > > - ib_conn->post_recv_buf_count++; > > ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); > > - if (ib_ret) { > > + if (ib_ret) > > iser_err("ib_post_recv failed ret=%d\n", ib_ret); > > - ib_conn->post_recv_buf_count--; > > - } > > > > return ib_ret; > > } > > > > -int iser_post_recvm(struct 
iser_conn *iser_conn, int count) > > +int iser_post_recvm(struct iser_conn *iser_conn, struct iser_rx_desc *rx_desc) > > { > > struct ib_conn *ib_conn = &iser_conn->ib_conn; > > - unsigned int my_rx_head = iser_conn->rx_desc_head; > > - struct iser_rx_desc *rx_desc; > > - struct ib_recv_wr *wr, *wr_failed; > > - int i, ib_ret; > > - > > - for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { > > - rx_desc = &iser_conn->rx_descs[my_rx_head]; > > - rx_desc->cqe.done = iser_task_rsp; > > - wr->wr_cqe = &rx_desc->cqe; > > - wr->sg_list = &rx_desc->rx_sg; > > - wr->num_sge = 1; > > - wr->next = wr + 1; > > - my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; > > - } > > + struct ib_recv_wr wr, *wr_failed; > > + int ib_ret; > > > > - wr--; > > - wr->next = NULL; /* mark end of work requests list */ > > + rx_desc->cqe.done = iser_task_rsp; > > + wr.wr_cqe = &rx_desc->cqe; > > + wr.sg_list = &rx_desc->rx_sg; > > + wr.num_sge = 1; > > + wr.next = NULL; > > > > - ib_conn->post_recv_buf_count += count; > > - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); > > - if (ib_ret) { > > + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); > > + if (ib_ret) > > iser_err("ib_post_recv failed ret=%d\n", ib_ret); > > - ib_conn->post_recv_buf_count -= count; > > - } else > > - iser_conn->rx_desc_head = my_rx_head; > > > > return ib_ret; > > } > > -- Doug Ledford <dledford@xxxxxxxxxx> GPG KeyID: B826A3330E572FDD Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD
Attachment:
signature.asc
Description: This is a digitally signed message part