Currently, the main process handles SCSI protocol processing (and network I/O for iSCSI), and four I/O threads run per LUN to handle disk I/O. The current model doesn't scale with the number of targets if you have a fast network (10GbE) and fast disk drives (SSDs). With this patch, we use one pthread per target for iSCSI/TCP (not iSER). Each target's pthread handles SCSI protocol processing and network I/O for that target, and four I/O threads run per LUN to handle disk I/O. Note that the pthread-per-target model is enabled only if tgt uses signalfd. Even with the main process model, tgt is much faster on signalfd-capable kernels. Linux 2.6.22 or newer is strongly recommended. Signed-off-by: FUJITA Tomonori <fujita.tomonori@xxxxxxxxxxxxx> --- usr/bs.c | 55 +++++++++++++++++++++--------------- usr/iscsi/conn.c | 5 ++- usr/iscsi/iscsi_rdma.c | 8 +++++ usr/iscsi/iscsi_tcp.c | 56 ++++++++++++++++++++++++++++++++++-- usr/iscsi/iscsid.c | 9 +++++- usr/iscsi/iscsid.h | 9 ++++++ usr/iscsi/target.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++- usr/iscsi/transport.h | 3 ++ usr/target.h | 2 + usr/tgtd.h | 8 +++++ 10 files changed, 197 insertions(+), 31 deletions(-) diff --git a/usr/bs.c b/usr/bs.c index e74cc13..a29a5f4 100644 --- a/usr/bs.c +++ b/usr/bs.c @@ -34,14 +34,14 @@ #include "list.h" #include "tgtd.h" +#include "target.h" #include "tgtadm_error.h" #include "util.h" #include "bs_thread.h" static LIST_HEAD(bst_list); -static LIST_HEAD(finished_list); -static pthread_mutex_t finished_lock; +struct bs_finish bs_finish, *bsf = &bs_finish; int sig_fd = -1; @@ -87,15 +87,15 @@ retry: goto out; } - pthread_mutex_lock(&finished_lock); + pthread_mutex_lock(&bsf->finished_lock); retest: - if (list_empty(&finished_list)) { - pthread_cond_wait(&finished_cond, &finished_lock); + if (list_empty(&bsf->finished_list)) { + pthread_cond_wait(&finished_cond, &bsf->finished_lock); goto retest; } - while (!list_empty(&finished_list)) { - cmd = list_first_entry(&finished_list, + while
(!list_empty(&bsf->finished_list)) { + cmd = list_first_entry(&bsf->finished_list, struct scsi_cmd, bs_list); dprintf("found %p\n", cmd); @@ -104,7 +104,7 @@ retest: list_add_tail(&cmd->bs_list, &ack_list); } - pthread_mutex_unlock(&finished_lock); + pthread_mutex_unlock(&bsf->finished_lock); nr = 1; rewrite: @@ -154,9 +154,10 @@ rewrite: } } -static void bs_sig_request_done(int fd, int events, void *data) +void bs_sig_request_done(int fd, int events, void *data) { int ret; + struct bs_finish *b = data; struct scsi_cmd *cmd; struct signalfd_siginfo siginfo[16]; LIST_HEAD(list); @@ -166,9 +167,9 @@ static void bs_sig_request_done(int fd, int events, void *data) return; } - pthread_mutex_lock(&finished_lock); - list_splice_init(&finished_list, &list); - pthread_mutex_unlock(&finished_lock); + pthread_mutex_lock(&b->finished_lock); + list_splice_init(&b->finished_list, &list); + pthread_mutex_unlock(&b->finished_lock); while (!list_empty(&list)) { cmd = list_first_entry(&list, struct scsi_cmd, bs_list); @@ -184,6 +185,7 @@ static void *bs_thread_worker_fn(void *arg) struct bs_thread_info *info = arg; struct scsi_cmd *cmd; sigset_t set; + struct bs_finish *tbsf; sigfillset(&set); sigprocmask(SIG_BLOCK, &set, NULL); @@ -207,16 +209,24 @@ static void *bs_thread_worker_fn(void *arg) cmd = list_first_entry(&info->pending_list, struct scsi_cmd, bs_list); + + if (cmd->c_target->bsf) + tbsf = cmd->c_target->bsf; + else + tbsf = bsf; + list_del(&cmd->bs_list); pthread_mutex_unlock(&info->pending_lock); info->request_fn(cmd); - pthread_mutex_lock(&finished_lock); - list_add_tail(&cmd->bs_list, &finished_list); - pthread_mutex_unlock(&finished_lock); + pthread_mutex_lock(&tbsf->finished_lock); + list_add_tail(&cmd->bs_list, &tbsf->finished_list); + pthread_mutex_unlock(&tbsf->finished_lock); - if (sig_fd < 0) + if (cmd->c_target->bsf) + pthread_kill(cmd->c_target->bsf->thread, SIGUSR2); + else if (sig_fd < 0) pthread_cond_signal(&finished_cond); else kill(getpid(), SIGUSR2); @@ 
-225,13 +235,11 @@ static void *bs_thread_worker_fn(void *arg) pthread_exit(NULL); } -static int bs_init_signalfd(void) +static int bs_init_signalfd(struct bs_finish *b) { sigset_t mask; int ret; - pthread_mutex_init(&finished_lock, NULL); - sigemptyset(&mask); sigaddset(&mask, SIGUSR2); sigprocmask(SIG_BLOCK, &mask, NULL); @@ -240,7 +248,7 @@ static int bs_init_signalfd(void) if (sig_fd < 0) return 1; - ret = tgt_event_add(sig_fd, EPOLLIN, bs_sig_request_done, NULL); + ret = tgt_event_add(sig_fd, EPOLLIN, bs_sig_request_done, b); if (ret < 0) { close (sig_fd); sig_fd = -1; @@ -256,7 +264,6 @@ static int bs_init_notify_thread(void) int ret; pthread_cond_init(&finished_cond, NULL); - pthread_mutex_init(&finished_lock, NULL); ret = pipe(command_fd); if (ret) { @@ -298,7 +305,6 @@ close_command_fd: close(command_fd[1]); destroy_cond_mutex: pthread_cond_destroy(&finished_cond); - pthread_mutex_destroy(&finished_lock); return 1; } @@ -307,7 +313,10 @@ int bs_init(void) { int ret; - ret = bs_init_signalfd(); + pthread_mutex_init(&bsf->finished_lock, NULL); + INIT_LIST_HEAD(&bsf->finished_list); + + ret = bs_init_signalfd(bsf); if (!ret) { eprintf("use signalfd notification\n"); return 0; diff --git a/usr/iscsi/conn.c b/usr/iscsi/conn.c index ba7a58f..d8601e1 100644 --- a/usr/iscsi/conn.c +++ b/usr/iscsi/conn.c @@ -23,6 +23,7 @@ #include <string.h> #include <errno.h> #include <sys/stat.h> +#include <sys/epoll.h> #include "iscsid.h" #include "tgtd.h" @@ -231,7 +232,9 @@ int conn_close_force(uint32_t tid, uint64_t sid, uint32_t cid) list_for_each_entry(conn, &session->conn_list, clist) { if (conn->cid == cid) { eprintf("close %" PRIx64 " %u\n", sid, cid); - conn_close(conn); + conn->state = STATE_CLOSE; + conn->tp->ep_event_modify(conn, + EPOLLIN|EPOLLOUT|EPOLLERR); return TGTADM_SUCCESS; } } diff --git a/usr/iscsi/iscsi_rdma.c b/usr/iscsi/iscsi_rdma.c index 63edebf..115d774 100644 --- a/usr/iscsi/iscsi_rdma.c +++ b/usr/iscsi/iscsi_rdma.c @@ -1194,6 +1194,8 @@ static int 
iscsi_rdma_init(void) INIT_LIST_HEAD(&iser_conn_list); INIT_LIST_HEAD(&temp_conn); + iscsi_rdma_enabled = 1; + return ret; } @@ -1246,6 +1248,11 @@ static int iscsi_rdma_login_complete(struct iscsi_connection *conn) return ret; } +static void iscsi_rdma_nexus_init(struct iscsi_connection *conn) +{ + conn->tp->ep_event_modify(conn, EPOLLIN); +} + /* * Copy the remote va and stag that were temporarily saved in conn_info. */ @@ -1725,6 +1732,7 @@ static struct iscsi_transport iscsi_iser = { .data_padding = 1, .ep_init = iscsi_rdma_init, .ep_login_complete = iscsi_rdma_login_complete, + .ep_nexus_init = iscsi_rdma_nexus_init, .alloc_task = iscsi_iser_alloc_task, .free_task = iscsi_iser_free_task, .ep_read = iscsi_iser_read, diff --git a/usr/iscsi/iscsi_tcp.c b/usr/iscsi/iscsi_tcp.c index 8fc145f..d1edd84 100644 --- a/usr/iscsi/iscsi_tcp.c +++ b/usr/iscsi/iscsi_tcp.c @@ -31,6 +31,7 @@ #include <netinet/tcp.h> #include <sys/epoll.h> #include <sys/socket.h> +#include <pthread.h> #include "iscsid.h" #include "tgtd.h" @@ -43,6 +44,7 @@ static struct iscsi_transport iscsi_tcp; struct iscsi_tcp_connection { int fd; + int pthread; struct iscsi_connection iscsi_conn; }; @@ -153,6 +155,7 @@ out: static void iscsi_tcp_event_handler(int fd, int events, void *data) { struct iscsi_connection *conn = (struct iscsi_connection *) data; + struct iscsi_tcp_connection *tcp_conn = TCP_CONN(conn); if (events & EPOLLIN) iscsi_rx_handler(conn); @@ -165,7 +168,19 @@ static void iscsi_tcp_event_handler(int fd, int events, void *data) if (conn->state == STATE_CLOSE) { dprintf("connection closed %p\n", conn); - conn_close(conn); + if (tcp_conn->pthread) { + struct iscsi_target *target = conn->session->target; + + pthread_mutex_lock(&target->event_lock); + do_tgt_event_del(target->efd, &target->events_list, + tcp_conn->fd); + pthread_mutex_unlock(&target->event_lock); + /* let the main thread handle this */ + tcp_conn->pthread = 0; + tgt_event_modify(tcp_conn->fd, EPOLLIN|EPOLLOUT|EPOLLERR); + } 
else { + conn_close(conn); + } } } @@ -263,6 +278,29 @@ static int iscsi_tcp_conn_login_complete(struct iscsi_connection *conn) return 0; } +static void iscsi_tcp_conn_nexus_init(struct iscsi_connection *conn) +{ + struct iscsi_tcp_connection *tcp_conn = TCP_CONN(conn); + struct iscsi_target *target = conn->session->target; + + if (iscsi_pthread_per_target()) { + /* remove the conn from the main thread. */ + conn->tp->ep_event_modify(conn, 0); + + pthread_mutex_lock(&target->event_lock); + + do_tgt_event_add(target->efd, &target->events_list, + tcp_conn->fd, EPOLLIN, + iscsi_tcp_event_handler, conn); + + pthread_mutex_unlock(&target->event_lock); + + tcp_conn->pthread = 1; + } + + conn->tp->ep_event_modify(conn, EPOLLIN); +} + static size_t iscsi_tcp_read(struct iscsi_connection *conn, void *buf, size_t nbytes) { @@ -336,9 +374,18 @@ static void iscsi_event_modify(struct iscsi_connection *conn, int events) struct iscsi_tcp_connection *tcp_conn = TCP_CONN(conn); int ret; - ret = tgt_event_modify(tcp_conn->fd, events); - if (ret) - eprintf("tgt_event_modify failed\n"); + if (tcp_conn->pthread) { + struct iscsi_target *target = conn->session->target; + + pthread_mutex_lock(&target->event_lock); + do_tgt_event_modify(target->efd, &target->events_list, + tcp_conn->fd, events); + pthread_mutex_unlock(&target->event_lock); + } else { + ret = tgt_event_modify(tcp_conn->fd, events); + if (ret) + eprintf("tgt_event_modify failed\n"); + } } static struct iscsi_task *iscsi_tcp_alloc_task(struct iscsi_connection *conn, @@ -391,6 +438,7 @@ static struct iscsi_transport iscsi_tcp = { .ep_init = iscsi_tcp_init, .ep_exit = iscsi_tcp_exit, .ep_login_complete = iscsi_tcp_conn_login_complete, + .ep_nexus_init = iscsi_tcp_conn_nexus_init, .alloc_task = iscsi_tcp_alloc_task, .free_task = iscsi_tcp_free_task, .ep_read = iscsi_tcp_read, diff --git a/usr/iscsi/iscsid.c b/usr/iscsi/iscsid.c index dcca384..b4e0969 100644 --- a/usr/iscsi/iscsid.c +++ b/usr/iscsi/iscsid.c @@ -73,6 +73,13 @@ 
enum { IOSTATE_TX_END, }; +int iscsi_rdma_enabled; + +int iscsi_pthread_per_target(void) +{ + return sig_fd >= 0 && !iscsi_rdma_enabled; +} + void conn_read_pdu(struct iscsi_connection *conn) { conn->rx_iostate = IOSTATE_RX_BHS; @@ -2224,7 +2231,7 @@ finish: else { conn->state = STATE_SCSI; conn_read_pdu(conn); - conn->tp->ep_event_modify(conn, EPOLLIN); + conn->tp->ep_nexus_init(conn); } break; case STATE_EXIT: diff --git a/usr/iscsi/iscsid.h b/usr/iscsi/iscsid.h index 6b982cb..1e70d81 100644 --- a/usr/iscsi/iscsid.h +++ b/usr/iscsi/iscsid.h @@ -244,6 +244,13 @@ struct iscsi_target { int nr_sessions; struct list_head isns_list; + + int efd; + pthread_mutex_t event_lock; + struct list_head events_list; + + struct bs_finish bsfin; + int stop_pthread; }; enum task_flags { @@ -310,6 +317,8 @@ extern int iscsi_target_show(int mode, int tid, uint64_t sid, uint32_t cid, int iscsi_target_update(int mode, int op, int tid, uint64_t sid, uint64_t lun, uint32_t cid, char *name); +int iscsi_pthread_per_target(void); + /* param.c */ int param_index_by_name(char *name, struct iscsi_key *keys); diff --git a/usr/iscsi/target.c b/usr/iscsi/target.c index c6ac031..b547626 100644 --- a/usr/iscsi/target.c +++ b/usr/iscsi/target.c @@ -25,6 +25,7 @@ #include <unistd.h> #include <netdb.h> #include <sys/stat.h> +#include <sys/epoll.h> #include <sys/un.h> #include <netinet/in.h> #include <sys/socket.h> @@ -32,10 +33,12 @@ #include <netinet/tcp.h> #include <netinet/ip.h> #include <arpa/inet.h> +#include <pthread.h> #include "iscsid.h" #include "tgtadm.h" #include "tgtd.h" #include "target.h" +#include "util.h" LIST_HEAD(iscsi_targets_list); @@ -252,12 +255,63 @@ void iscsi_target_destroy(int tid) } list_del(&target->tlist); + + pthread_mutex_init(&target->event_lock, NULL); + + if (target->bsfin.thread) { + target->stop_pthread = 1; + pthread_kill(target->bsfin.thread, SIGUSR2); + + pthread_join(target->bsfin.thread, NULL); + pthread_mutex_destroy(&target->bsfin.finished_lock); + } + + 
close(target->efd); free(target); isns_target_deregister(tgt_targetname(tid)); return; } +static void *iscsi_thread_fn(void *arg) +{ + struct iscsi_target *t = arg; + struct epoll_event events[1024]; + struct event_data *tev; + sigset_t mask; + int nevent, i; + + sigemptyset(&mask); + sigaddset(&mask, SIGUSR2); + pthread_sigmask(SIG_BLOCK, &mask, NULL); + + pthread_mutex_lock(&t->event_lock); + + do_tgt_event_add(t->efd, &t->events_list, sig_fd, EPOLLIN, + bs_sig_request_done, &t->bsfin); + + pthread_mutex_unlock(&t->event_lock); + +retry: + nevent = epoll_wait(t->efd, events, ARRAY_SIZE(events), 1000); + if (nevent < 0) { + if (errno != EINTR) { + eprintf("%m\n"); + exit(1); + } + } else if (nevent) { + for (i = 0; i < nevent; i++) { + tev = (struct event_data *) events[i].data.ptr; + tev->handler(tev->fd, events[i].events, tev->data); + } + } + + if (!t->stop_pthread) + goto retry; + + pthread_exit(NULL); +} + int iscsi_target_create(struct target *t) { int tid = t->tid; @@ -288,11 +342,15 @@ int iscsi_target_create(struct target *t) [ISCSI_PARAM_MAX_OUTST_PDU] = {0, 0}, /* not in open-iscsi */ }; - target = malloc(sizeof(*target)); + target = zalloc(sizeof(*target)); if (!target) return -ENOMEM; - memset(target, 0, sizeof(*target)); + target->efd = epoll_create(128); + if (target->efd < 0) { + free(target); + return -EINVAL; + } memcpy(target->session_param, default_tgt_session_param, sizeof(target->session_param)); @@ -300,10 +358,21 @@ int iscsi_target_create(struct target *t) INIT_LIST_HEAD(&target->tlist); INIT_LIST_HEAD(&target->sessions_list); INIT_LIST_HEAD(&target->isns_list); + INIT_LIST_HEAD(&target->events_list); target->tid = tid; list_add_tail(&target->tlist, &iscsi_targets_list); isns_target_register(tgt_targetname(tid)); + + if (iscsi_pthread_per_target()) { + pthread_create(&target->bsfin.thread, NULL, iscsi_thread_fn, target); + + pthread_mutex_init(&target->bsfin.finished_lock, NULL); + INIT_LIST_HEAD(&target->bsfin.finished_list); + t->bsf = 
&target->bsfin; + eprintf("create thread %u\n", (unsigned)target->bsfin.thread); + } + return 0; } diff --git a/usr/iscsi/transport.h b/usr/iscsi/transport.h index 92a6f0a..c94b86b 100644 --- a/usr/iscsi/transport.h +++ b/usr/iscsi/transport.h @@ -4,6 +4,8 @@ #include <sys/socket.h> #include "list.h" +extern int iscsi_rdma_enabled; + struct iscsi_connection; struct iscsi_task; @@ -17,6 +19,7 @@ struct iscsi_transport { int (*ep_init) (void); void (*ep_exit) (void); int (*ep_login_complete)(struct iscsi_connection *conn); + void (*ep_nexus_init)(struct iscsi_connection *conn); struct iscsi_task *(*alloc_task)(struct iscsi_connection *conn, size_t ext_len); void (*free_task)(struct iscsi_task *task); diff --git a/usr/target.h b/usr/target.h index 9283431..4607fc4 100644 --- a/usr/target.h +++ b/usr/target.h @@ -39,6 +39,8 @@ struct target { struct list_head acl_list; struct tgt_account account; + + struct bs_finish *bsf; }; struct it_nexus { diff --git a/usr/tgtd.h b/usr/tgtd.h index 79d9c88..b8541c8 100644 --- a/usr/tgtd.h +++ b/usr/tgtd.h @@ -334,6 +334,14 @@ struct event_data { extern int sig_fd; +struct bs_finish { + struct list_head finished_list; + pthread_mutex_t finished_lock; + pthread_t thread; +}; + +void bs_sig_request_done(int fd, int events, void *data); + int do_tgt_event_add(int efd, struct list_head *list, int fd, int events, event_handler_t handler, void *data); void do_tgt_event_del(int efd, struct list_head *list, int fd); -- 1.6.5 -- To unsubscribe from this list: send the line "unsubscribe stgt" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html