This patch introduces a custom event scheduler, a non-blocking epoll_wait when events are pending, and delayed IB completion notifications. Together these lead to a significant reduction in the interrupt rate for iser/ib, while adding flexibility to the tgtd event processing scheme. Signed-off-by: Alexander Nezhinsky <nezhinsky@xxxxxxxxx> --- usr/iscsi/iscsi_rdma.c | 240 +++++++++++++++++++++++++++--------------------- usr/tgtd.c | 136 ++++++++++++--------------- usr/tgtd.h | 30 +++++- 3 files changed, 219 insertions(+), 187 deletions(-) diff --git a/usr/iscsi/iscsi_rdma.c b/usr/iscsi/iscsi_rdma.c index 46e6ea8..35b0f13 100644 --- a/usr/iscsi/iscsi_rdma.c +++ b/usr/iscsi/iscsi_rdma.c @@ -144,6 +144,8 @@ struct conn_info { /* but count so we can drain CQ on close */ int recvl_posted; + struct tgt_event tx_sched; + /* login phase resources, freed at full-feature */ void *srbuf_login; void *listbuf_login; @@ -194,6 +196,8 @@ struct iser_device { void *mempool_listbuf; struct ibv_mr *mempool_mr; + struct tgt_event poll_sched; + /* free and allocated mempool entries */ struct list_head mempool_free, mempool_alloc; }; @@ -217,10 +221,6 @@ static struct list_head iser_conn_list; /* if any task needs an rdma read or write slot to proceed */ static int waiting_rdma_slot; -/* progress available, used with tgt_counter_event */ -static int num_tx_ready; -static int num_rx_ready; - #define uint64_from_ptr(p) (uint64_t)(uintptr_t)(p) #define ptr_from_int64(p) (void *)(unsigned long)(p) @@ -270,13 +270,17 @@ static inline struct conn_info *RDMA_CONN(struct iscsi_connection *conn) return container_of(conn, struct conn_info, iscsi_conn); } -static void iser_cqe_handler(int fd, int events, void *data); -static void iser_rx_progress(int *counter, void *data); +static void iser_cqe_handler(int fd __attribute__((unused)), + int events __attribute__((unused)), + void *data); static void iser_rdma_read_completion(struct rdmalist *rdma); static void iscsi_rdma_release(struct iscsi_connection *conn); static int 
iscsi_rdma_show(struct iscsi_connection *conn, char *buf, int rest); static void iscsi_rdma_event_modify(struct iscsi_connection *conn, int events); +static void iser_sched_poll_cq(struct tgt_event *tev); +static void iser_sched_drain_cq(struct tgt_event *tev); +static void iser_sched_tx(struct tgt_event *evt); /* * Called when ready for full feature, builds resources. @@ -612,6 +616,8 @@ static int iser_device_init(struct iser_device *dev) goto out; } + tgt_init_sched_event(&dev->poll_sched,iser_sched_poll_cq,dev); + ret = ibv_req_notify_cq(dev->cq, 0); if (ret) { eprintf("ibv_req_notify failed: %s\n", strerror(ret)); @@ -691,7 +697,10 @@ static void iser_accept_connection(struct rdma_cm_event *event) ci->login_phase = LOGIN_PHASE_START; INIT_LIST_HEAD(&ci->conn_tx_ready); list_add(&ci->iser_conn_list, &temp_conn); - /* initiator sits at dst, we are src */ + + tgt_init_sched_event(&ci->tx_sched,iser_sched_tx,ci); + + /* initiator sits at dst, we are src */ memcpy(&ci->peer_addr, &event->id->route.addr.dst_addr, sizeof(ci->peer_addr)); memcpy(&ci->self_addr, &event->id->route.addr.src_addr, @@ -940,7 +949,7 @@ static void handle_wc(struct ibv_wc *wc) list_add(&rdmal->list, &ci->rdmal); if (waiting_rdma_slot) { waiting_rdma_slot = 0; - num_tx_ready = 1; + tgt_add_sched_event(&ci->tx_sched); } break; @@ -957,7 +966,7 @@ static void handle_wc(struct ibv_wc *wc) list_add(&rdmal->list, &ci->rdmal); if (waiting_rdma_slot) { waiting_rdma_slot = 0; - num_tx_ready = 1; + tgt_add_sched_event(&ci->tx_sched); } break; @@ -974,85 +983,14 @@ close_err: } /* - * Called directly from main event loop when a CQ notification is - * available. 
- */ -static void iser_cqe_handler(int fd __attribute__((unused)), - int events __attribute__((unused)), - void *data) -{ - int ret; - void *cq_context; - struct iser_device *dev = data; - - ret = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context); - if (ret != 0) { - eprintf("notification, but no CQ event\n"); - exit(1); - } - - ibv_ack_cq_events(dev->cq, 1); - - ret = ibv_req_notify_cq(dev->cq, 0); - if (ret) { - eprintf("ibv_req_notify_cq: %s\n", strerror(ret)); - exit(1); - } - - iser_rx_progress(NULL, dev); -} - -/* - * Called from tgtd when num_tx_ready (counter) non-zero. Walks the - * list of active connections and tries to push tx on each, until nothing - * is ready anymore. No progress limit here. - */ -static void iser_tx_progress(int *counter __attribute__((unused)), - void *data __attribute__((unused))) -{ - int reloop, ret; - struct conn_info *ci, *cin; - struct iscsi_connection *conn; - - dprintf("entry\n"); - num_tx_ready = 0; - - do { - reloop = 0; - list_for_each_entry_safe(ci, cin, &conn_tx_ready, - conn_tx_ready) { - conn = &ci->iscsi_conn; - if (conn->state == STATE_CLOSE) { - dprintf("ignoring tx for closed conn\n"); - } else { - dprintf("trying tx\n"); - ret = iscsi_tx_handler(conn); - if (conn->state == STATE_CLOSE) { - conn_close(conn); - dprintf("connection %p closed\n", ci); - } else { - if (ret == 0) { - reloop = 1; - } else { - /* but leave on tx ready list */ - waiting_rdma_slot = 1; - } - } - } - } - } while (reloop); -} - -/* * Could read as many entries as possible without blocking, but * that just fills up a list of tasks. Instead pop out of here * so that tx progress, like issuing rdma reads and writes, can * happen periodically. 
*/ -#define MAX_RX_PROGRESS 8 -static void iser_rx_progress_one(struct iser_device *dev) +static int iser_poll_cq(struct iser_device *dev, int max_wc) { - int ret, numwc = 0; + int ret = 0, numwc = 0; struct ibv_wc wc; struct conn_info *ci; struct recvlist *recvl; @@ -1069,8 +1007,8 @@ static void iser_rx_progress_one(struct iser_device *dev) VALGRIND_MAKE_MEM_DEFINED(&wc, sizeof(wc)); if (wc.status == IBV_WC_SUCCESS) { handle_wc(&wc); - if (++numwc == MAX_RX_PROGRESS) { - num_rx_ready = 1; + if (++numwc == max_wc) { + ret = 1; break; } } else if (wc.status == IBV_WC_WR_FLUSH_ERR) { @@ -1089,24 +1027,121 @@ static void iser_rx_progress_one(struct iser_device *dev) wc.status, (unsigned long long) wc.wr_id); } } + return ret; +} + +static void iser_poll_cq_normal(struct iser_device *dev) +{ + int ret; + + ret = iser_poll_cq(dev,8); + if (ret < 0) + exit(1); + + if (ret == 0) { + ret = ibv_req_notify_cq(dev->cq, 0); + if (ret) { + eprintf("ibv_req_notify_cq: %s\n", strerror(ret)); + exit(1); + } + dev->poll_sched.sched_handler = iser_sched_drain_cq; + } + else + dev->poll_sched.sched_handler = iser_sched_poll_cq; + + tgt_add_sched_event(&dev->poll_sched); +} + +static void iser_poll_cq_drain(struct iser_device *dev) +{ + int ret; + + ret = iser_poll_cq(dev,4); + if (ret < 0) + exit(1); + + dev->poll_sched.sched_handler = iser_sched_poll_cq; + if (ret == 0) { + ret = ibv_req_notify_cq(dev->cq, 0); + if (ret) { + eprintf("ibv_req_notify_cq: %s\n", strerror(ret)); + exit(1); + } + } +} + +static void iser_sched_poll_cq(struct tgt_event *tev) +{ + struct iser_device *dev = tev->data; + iser_poll_cq_normal(dev); +} + +static void iser_sched_drain_cq(struct tgt_event *tev) +{ + struct iser_device *dev = tev->data; + iser_poll_cq_drain(dev); +} + +/* + * Called directly from main event loop when a CQ notification is + * available. 
+ */ +static void iser_cqe_handler(int fd __attribute__((unused)), + int events __attribute__((unused)), + void *data) +{ + struct iser_device *dev = data; + void *cq_context; + int ret; + + ret = ibv_get_cq_event(dev->cq_channel, &dev->cq, &cq_context); + if (ret != 0) { + eprintf("notification, but no CQ event\n"); + exit(1); + } + + ibv_ack_cq_events(dev->cq, 1); + + /* if a poll was previosuly scheduled, remove it as it + will be scheduled if necessary */ + if (dev->poll_sched.scheduled) + tgt_remove_sched_event(&dev->poll_sched); + + iser_poll_cq_normal(dev); } /* - * Only one progress counter, must look across all devs. + * Called from tgtd as a scheduled event + * tries to push tx on a connection, until nothing + * is ready anymore. No progress limit here. */ -static void iser_rx_progress(int *counter __attribute__((unused)), void *data) +static void iser_sched_tx(struct tgt_event *evt) { - struct iser_device *dev; + struct conn_info *ci = evt->data; + struct iscsi_connection *conn = &ci->iscsi_conn; + int ret; dprintf("entry\n"); - num_rx_ready = 0; - if (data == NULL) { - list_for_each_entry(dev, &iser_dev_list, list) - iser_rx_progress_one(dev); - } else { - dev = data; - iser_rx_progress_one(dev); - } + + if (conn->state == STATE_CLOSE) { + dprintf("ignoring tx for closed conn\n"); + return; + } + + for(;;) { + dprintf("trying tx\n"); + ret = iscsi_tx_handler(conn); + if (conn->state == STATE_CLOSE) { + conn_close(conn); + dprintf("connection %p closed\n", ci); + break; + } + if (ret != 0) { + /* but leave on tx ready list */ + waiting_rdma_slot = 1; + break; + } + } } /* @@ -1165,11 +1200,8 @@ static int iscsi_rdma_init(void) INIT_LIST_HEAD(&iser_dev_list); INIT_LIST_HEAD(&iser_conn_list); INIT_LIST_HEAD(&temp_conn); - num_tx_ready = 0; - num_rx_ready = 0; - ret = tgt_counter_event_add(&num_tx_ready, iser_tx_progress, NULL); - ret = tgt_counter_event_add(&num_rx_ready, iser_rx_progress, NULL); - return ret; + + return ret; } /* @@ -1397,10 +1429,6 @@ 
static void iscsi_iser_write_end(struct iscsi_connection *conn) ci->writeb = 0; /* reset count */ ci->send_comm_event = NULL; - - /* wake up the progress engine to do the done */ - dprintf("inc progress to finish cmd\n"); - num_tx_ready = 1; } /* @@ -1505,7 +1533,7 @@ static int iscsi_rdma_rdma_write(struct iscsi_connection *conn) iscsi_rdma_event_modify(conn, EPOLLIN); } else { /* poke ourselves to do the next rdma */ - num_tx_ready = 1; + tgt_add_sched_event(&ci->tx_sched); } return ret; @@ -1628,7 +1656,7 @@ static void iscsi_rdma_event_modify(struct iscsi_connection *conn, int events) dprintf("tx ready adding %p\n", ci); list_add(&ci->conn_tx_ready, &conn_tx_ready); } - num_tx_ready = 1; + tgt_add_sched_event(&ci->tx_sched); } else { dprintf("tx ready removing %p\n", ci); list_del_init(&ci->conn_tx_ready); diff --git a/usr/tgtd.c b/usr/tgtd.c index 0b1cb4c..287f051 100644 --- a/usr/tgtd.c +++ b/usr/tgtd.c @@ -38,26 +38,13 @@ #include "work.h" #include "util.h" -struct tgt_event { - union { - event_handler_t *handler; - counter_event_handler_t *counter_handler; - }; - union { - int fd; - int *counter; - }; - void *data; - struct list_head e_list; -}; - unsigned long pagesize, pageshift, pagemask; int system_active = 1; static int ep_fd; static char program_name[] = "tgtd"; static LIST_HEAD(tgt_events_list); -static LIST_HEAD(tgt_counter_events_list); +static LIST_HEAD(tgt_sched_events_list); static struct option const long_options[] = { @@ -136,22 +123,6 @@ int tgt_event_add(int fd, int events, event_handler_t handler, void *data) return err; } -int tgt_counter_event_add(int *counter, counter_event_handler_t handler, - void *data) -{ - struct tgt_event *tev; - - tev = zalloc(sizeof(*tev)); - if (!tev) - return -ENOMEM; - - tev->data = data; - tev->counter_handler = handler; - tev->counter = counter; - list_add(&tev->e_list, &tgt_counter_events_list); - return 0; -} - static struct tgt_event *tgt_event_lookup(int fd) { struct tgt_event *tev; @@ -163,17 +134,6 @@ 
static struct tgt_event *tgt_event_lookup(int fd) return NULL; } -static struct tgt_event *tgt_counter_event_lookup(int *counter) -{ - struct tgt_event *tev; - - list_for_each_entry(tev, &tgt_counter_events_list, e_list) { - if (tev->counter == counter) - return tev; - } - return NULL; -} - void tgt_event_del(int fd) { struct tgt_event *tev; @@ -189,20 +149,6 @@ void tgt_event_del(int fd) free(tev); } -void tgt_counter_event_del(int *counter) -{ - struct tgt_event *tev; - - tev = tgt_counter_event_lookup(counter); - if (!tev) { - eprintf("Cannot find counter event %p\n", counter); - return; - } - - list_del(&tev->e_list); - free(tev); -} - int tgt_event_modify(int fd, int events) { struct epoll_event ev; @@ -221,26 +167,54 @@ int tgt_event_modify(int fd, int events) return epoll_ctl(ep_fd, EPOLL_CTL_MOD, fd, &ev); } -static void event_loop(void) +void tgt_init_sched_event(struct tgt_event *evt, + sched_event_handler_t sched_handler, void *data) +{ + evt->sched_handler = sched_handler; + evt->scheduled = 0; + evt->data = data; + INIT_LIST_HEAD(&evt->e_list); +} + +void tgt_add_sched_event(struct tgt_event *evt) { - int nevent, i, done, timeout = TGTD_TICK_PERIOD * 1000; - struct epoll_event events[1024]; - struct tgt_event *tev, *tevn; - -retry: - /* - * Check the counter events to see if they have any work to run. 
- */ - do { - done = 1; - list_for_each_entry_safe(tev, tevn, &tgt_counter_events_list, - e_list) { - if (*tev->counter) { - done = 0; - tev->counter_handler(tev->counter, tev->data); - } - } - } while (!done); + if (!evt->scheduled) { + evt->scheduled = 1; + list_add_tail(&evt->e_list,&tgt_sched_events_list); + } +} + +void tgt_remove_sched_event(struct tgt_event *evt) +{ + if (evt->scheduled) { + evt->scheduled = 0; + list_del_init(&evt->e_list); + } +} + +static void tgt_exec_scheduled(void) +{ + struct list_head *last_sched; + struct tgt_event *tev, *tevn; + + if (list_empty(&tgt_sched_events_list)) + return; + + /* execute only work scheduled till now */ + last_sched = tgt_sched_events_list.prev; + list_for_each_entry_safe(tev,tevn,&tgt_sched_events_list,e_list) { + tgt_remove_sched_event(tev); + tev->sched_handler(tev); + if (&tev->e_list == last_sched) + break; + } +} + +static void tgt_poll_events(int timeout) +{ + int nevent, i; + struct tgt_event *tev; + struct epoll_event events[1024]; nevent = epoll_wait(ep_fd, events, ARRAY_SIZE(events), timeout); if (nevent < 0) { @@ -255,9 +229,19 @@ retry: } } else schedule(); +} - if (system_active) - goto retry; +static void event_loop(void) +{ + int timeout, wait_timeout = TGTD_TICK_PERIOD * 1000; + + while (system_active) { + tgt_exec_scheduled(); + /* wait if no scheduled work, poll if there is */ + timeout = list_empty(&tgt_sched_events_list) ? 
+ wait_timeout : 0; + tgt_poll_events(timeout); + } } static int lld_init(int *use_kernel, char *args) diff --git a/usr/tgtd.h b/usr/tgtd.h index 4febcd3..0e226f7 100644 --- a/usr/tgtd.h +++ b/usr/tgtd.h @@ -206,13 +206,20 @@ extern int tgt_bind_host_to_target(int tid, int host_no); extern int tgt_unbind_host_to_target(int tid, int host_no); extern int tgt_bound_target_lookup(int host_no); -typedef void (event_handler_t)(int fd, int events, void *data); -typedef void (counter_event_handler_t)(int *counter, void *data); +struct tgt_event; +typedef void (* sched_event_handler_t)(struct tgt_event *tev); + +extern void tgt_init_sched_event(struct tgt_event *evt, + sched_event_handler_t sched_handler, void *data); + +typedef void (* event_handler_t)(int fd, int events, void *data); + extern int tgt_event_add(int fd, int events, event_handler_t handler, void *data); -extern int tgt_counter_event_add(int *counter, counter_event_handler_t handler, - void *data); extern void tgt_event_del(int fd); -extern void tgt_counter_event_del(int *counter); + +extern void tgt_add_sched_event(struct tgt_event *evt); +extern void tgt_remove_sched_event(struct tgt_event *evt); + extern int tgt_event_modify(int fd, int events); extern int target_cmd_queue(int tid, struct scsi_cmd *cmd); extern void target_cmd_done(struct scsi_cmd *cmd); @@ -262,4 +269,17 @@ extern int dtd_load_unload(int tid, uint64_t lun, int load, char *file); extern int register_backingstore_template(struct backingstore_template *bst); extern struct backingstore_template *get_backingstore_template(const char *name); +struct tgt_event { + union { + event_handler_t handler; + sched_event_handler_t sched_handler; + }; + union { + int fd; + int scheduled; + }; + void *data; + struct list_head e_list; +}; + #endif -- 1.5.6.5 -- To unsubscribe from this list: send the line "unsubscribe stgt" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html