On 07/06/2020 23:36, Pavel Begunkov wrote: > On 07/06/2020 18:02, Jens Axboe wrote: >> On 6/3/20 7:46 AM, Pavel Begunkov wrote: >>> On 02/06/2020 04:16, Xiaoguang Wang wrote: >>>> hi Jens, Pavel, >>>> >>>> Will you have a look at this V5 version? Or we hold on this patchset, and >>>> do the refactoring work related io_wq_work firstly. >>> >>> It's entirely up to Jens, but frankly, I think it'll bring more bugs than >>> merits in the current state of things. >> >> Well, I'd really like to reduce the overhead where we can, particularly >> when the overhead just exists to cater to the slow path. >> >> Planning on taking the next week off and not do too much, but I'll see >> if I can get some testing in with the current patches. >> > > I just think it should not be done at expense of robustness. > > e.g. instead of having tons of if's around ->func, we can get rid of > it and issue everything with io_wq_submit_work(). And there are plenty > of pros of doing that: > - freeing some space in io_kiocb (in req.work in particular) > - removing much of stuff with nice negative diffstat > - helping this series > - even safer than now -- can't be screwed with memcpy(req). > > Extra switch-lookup in io-wq shouldn't even be noticeable considering > punting overhead. And even though io-wq loses some flexibility, as for > me that's fine as long as there is only 1 user. How about the diff below, once it is split up and cooked properly? 3 files changed, 73 insertions(+), 164 deletions(-) diff --git a/fs/io-wq.c b/fs/io-wq.c index 2bfa9117bc28..a44ad3b98886 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -112,6 +112,7 @@ struct io_wq { unsigned long state; free_work_fn *free_work; + io_wq_work_fn *do_work; struct task_struct *manager; struct user_struct *user; @@ -528,7 +529,7 @@ static void io_worker_handle_work(struct io_worker *worker) hash = io_get_work_hash(work); linked = old_work = work; - linked->func(&linked); + wq->do_work(&linked); linked = (old_work == linked) ? 
NULL : linked; work = next_hashed; @@ -785,7 +786,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) struct io_wq_work *old_work = work; work->flags |= IO_WQ_WORK_CANCEL; - work->func(&work); + wq->do_work(&work); work = (work == old_work) ? NULL : work; wq->free_work(old_work); } while (work); @@ -1027,7 +1028,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) int ret = -ENOMEM, node; struct io_wq *wq; - if (WARN_ON_ONCE(!data->free_work)) + if (WARN_ON_ONCE(!data->free_work || !data->do_work)) return ERR_PTR(-EINVAL); wq = kzalloc(sizeof(*wq), GFP_KERNEL); @@ -1041,6 +1042,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) } wq->free_work = data->free_work; + wq->do_work = data->do_work; /* caller must already hold a reference to this */ wq->user = data->user; @@ -1097,7 +1099,7 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) { - if (data->free_work != wq->free_work) + if (data->free_work != wq->free_work || data->do_work != wq->do_work) return false; return refcount_inc_not_zero(&wq->use_refs); diff --git a/fs/io-wq.h b/fs/io-wq.h index df8a4cd3236d..f3bb596f5a3f 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -85,7 +85,6 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - void (*func)(struct io_wq_work **); struct files_struct *files; struct mm_struct *mm; const struct cred *creds; @@ -94,9 +93,9 @@ struct io_wq_work { pid_t task_pid; }; -#define INIT_IO_WORK(work, _func) \ +#define INIT_IO_WORK(work) \ do { \ - *(work) = (struct io_wq_work){ .func = _func }; \ + *(work) = (struct io_wq_work){}; \ } while (0) \ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) @@ -108,10 +107,12 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) } typedef void (free_work_fn)(struct io_wq_work *); +typedef void 
(io_wq_work_fn)(struct io_wq_work **); struct io_wq_data { struct user_struct *user; + io_wq_work_fn *do_work; free_work_fn *free_work; }; diff --git a/fs/io_uring.c b/fs/io_uring.c index 3aebbf96c123..3bfce882ede5 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -541,6 +541,7 @@ enum { REQ_F_POLLED_BIT, REQ_F_BUFFER_SELECTED_BIT, REQ_F_NO_FILE_TABLE_BIT, + REQ_F_QUEUE_TIMEOUT_BIT, /* not a real bit, just to check we're not overflowing the space */ __REQ_F_LAST_BIT, @@ -596,6 +597,8 @@ enum { REQ_F_BUFFER_SELECTED = BIT(REQ_F_BUFFER_SELECTED_BIT), /* doesn't need file table for this request */ REQ_F_NO_FILE_TABLE = BIT(REQ_F_NO_FILE_TABLE_BIT), + /* need to queue linked timeout */ + REQ_F_QUEUE_TIMEOUT = BIT(REQ_F_QUEUE_TIMEOUT_BIT), }; struct async_poll { @@ -1579,16 +1582,6 @@ static void io_free_req(struct io_kiocb *req) io_queue_async_work(nxt); } -static void io_link_work_cb(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - struct io_kiocb *link; - - link = list_first_entry(&req->link_list, struct io_kiocb, link_list); - io_queue_linked_timeout(link); - io_wq_submit_work(workptr); -} - static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) { struct io_kiocb *link; @@ -1600,7 +1593,7 @@ static void io_wq_assign_next(struct io_wq_work **workptr, struct io_kiocb *nxt) *workptr = &nxt->work; link = io_prep_linked_timeout(nxt); if (link) - nxt->work.func = io_link_work_cb; + nxt->flags |= REQ_F_QUEUE_TIMEOUT; } /* @@ -2940,23 +2933,15 @@ static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static bool io_req_cancelled(struct io_kiocb *req) -{ - if (req->work.flags & IO_WQ_WORK_CANCEL) { - req_set_fail_links(req); - io_cqring_add_event(req, -ECANCELED); - io_put_req(req); - return true; - } - - return false; -} - -static void __io_fsync(struct io_kiocb *req) +static int io_fsync(struct io_kiocb *req, bool force_nonblock) { loff_t end = 
req->sync.off + req->sync.len; int ret; + /* fsync always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + ret = vfs_fsync_range(req->file, req->sync.off, end > 0 ? end : LLONG_MAX, req->sync.flags & IORING_FSYNC_DATASYNC); @@ -2964,53 +2949,9 @@ static void __io_fsync(struct io_kiocb *req) req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); -} - -static void io_fsync_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_fsync(req); - io_steal_work(req, workptr); -} - -static int io_fsync(struct io_kiocb *req, bool force_nonblock) -{ - /* fsync always requires a blocking context */ - if (force_nonblock) { - req->work.func = io_fsync_finish; - return -EAGAIN; - } - __io_fsync(req); return 0; } -static void __io_fallocate(struct io_kiocb *req) -{ - int ret; - - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; - ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off, - req->sync.len); - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - io_put_req(req); -} - -static void io_fallocate_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_fallocate(req); - io_steal_work(req, workptr); -} - static int io_fallocate_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { @@ -3028,13 +2969,20 @@ static int io_fallocate_prep(struct io_kiocb *req, static int io_fallocate(struct io_kiocb *req, bool force_nonblock) { + int ret; + /* fallocate always requiring blocking context */ - if (force_nonblock) { - req->work.func = io_fallocate_finish; + if (force_nonblock) return -EAGAIN; - } - __io_fallocate(req); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = req->fsize; + ret = vfs_fallocate(req->file, req->sync.mode, 
req->sync.off, + req->sync.len); + current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + io_put_req(req); return 0; } @@ -3479,53 +3427,37 @@ static int io_close_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) req->close.fd == req->ctx->ring_fd) return -EBADF; + req->close.put_file = NULL; return 0; } -/* only called when __close_fd_get_file() is done */ -static void __io_close_finish(struct io_kiocb *req) -{ - int ret; - - ret = filp_close(req->close.put_file, req->work.files); - if (ret < 0) - req_set_fail_links(req); - io_cqring_add_event(req, ret); - fput(req->close.put_file); - io_put_req(req); -} - -static void io_close_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - /* not cancellable, don't do io_req_cancelled() */ - __io_close_finish(req); - io_steal_work(req, workptr); -} - static int io_close(struct io_kiocb *req, bool force_nonblock) { + struct io_close *close = &req->close; int ret; - req->close.put_file = NULL; - ret = __close_fd_get_file(req->close.fd, &req->close.put_file); - if (ret < 0) - return (ret == -ENOENT) ? -EBADF : ret; + /* might be already done during nonblock submission */ + if (!close->put_file) { + ret = __close_fd_get_file(close->fd, &close->put_file); + if (ret < 0) + return (ret == -ENOENT) ? -EBADF : ret; + } /* if the file has a flush method, be safe and punt to async */ - if (req->close.put_file->f_op->flush && force_nonblock) { + if (close->put_file->f_op->flush && force_nonblock) { /* avoid grabbing files - we don't need the files */ req->flags |= REQ_F_NO_FILE_TABLE | REQ_F_MUST_PUNT; - req->work.func = io_close_finish; return -EAGAIN; } - /* - * No ->flush(), safely close from here and just punt the - * fput() to async context. 
- */ - __io_close_finish(req); + /* No ->flush() or already async, safely close from here */ + ret = filp_close(close->put_file, req->work.files); + if (ret < 0) + req_set_fail_links(req); + io_cqring_add_event(req, ret); + fput(close->put_file); + close->put_file = NULL; + io_put_req(req); return 0; } @@ -3547,38 +3479,20 @@ static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static void __io_sync_file_range(struct io_kiocb *req) +static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) { int ret; + /* sync_file_range always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + ret = sync_file_range(req->file, req->sync.off, req->sync.len, req->sync.flags); if (ret < 0) req_set_fail_links(req); io_cqring_add_event(req, ret); io_put_req(req); -} - - -static void io_sync_file_range_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_sync_file_range(req); - io_steal_work(req, workptr); -} - -static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) -{ - /* sync_file_range always requires a blocking context */ - if (force_nonblock) { - req->work.func = io_sync_file_range_finish; - return -EAGAIN; - } - - __io_sync_file_range(req); return 0; } @@ -4000,49 +3914,27 @@ static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return 0; } -static int __io_accept(struct io_kiocb *req, bool force_nonblock) +static int io_accept(struct io_kiocb *req, bool force_nonblock) { struct io_accept *accept = &req->accept; - unsigned file_flags; + unsigned int file_flags = force_nonblock ? O_NONBLOCK : 0; int ret; - file_flags = force_nonblock ? 
O_NONBLOCK : 0; ret = __sys_accept4_file(req->file, file_flags, accept->addr, accept->addr_len, accept->flags, accept->nofile); if (ret == -EAGAIN && force_nonblock) return -EAGAIN; - if (ret == -ERESTARTSYS) - ret = -EINTR; - if (ret < 0) + if (ret < 0) { + if (ret == -ERESTARTSYS) + ret = -EINTR; req_set_fail_links(req); + } io_cqring_add_event(req, ret); io_put_req(req); return 0; } -static void io_accept_finish(struct io_wq_work **workptr) -{ - struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work); - - if (io_req_cancelled(req)) - return; - __io_accept(req, false); - io_steal_work(req, workptr); -} - -static int io_accept(struct io_kiocb *req, bool force_nonblock) -{ - int ret; - - ret = __io_accept(req, force_nonblock); - if (ret == -EAGAIN && force_nonblock) { - req->work.func = io_accept_finish; - return -EAGAIN; - } - return 0; -} - static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; @@ -5434,12 +5326,25 @@ static int io_issue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, return 0; } +static void io_link_work_cb(struct io_kiocb *req) +{ + struct io_kiocb *link; + + if (!(req->flags & REQ_F_QUEUE_TIMEOUT)) + return; + + link = list_first_entry(&req->link_list, struct io_kiocb, link_list); + io_queue_linked_timeout(link); +} + static void io_wq_submit_work(struct io_wq_work **workptr) { struct io_wq_work *work = *workptr; struct io_kiocb *req = container_of(work, struct io_kiocb, work); int ret = 0; + io_link_work_cb(req); + /* if NO_CANCEL is set, we must still run the work */ if ((work->flags & (IO_WQ_WORK_CANCEL|IO_WQ_WORK_NO_CANCEL)) == IO_WQ_WORK_CANCEL) { @@ -5974,7 +5879,7 @@ static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, refcount_set(&req->refs, 2); req->task = NULL; req->result = 0; - INIT_IO_WORK(&req->work, io_wq_submit_work); + INIT_IO_WORK(&req->work); if (unlikely(req->opcode >= IORING_OP_LAST)) return -EINVAL; @@ -6990,6 
+6895,7 @@ static int io_init_wq_offload(struct io_ring_ctx *ctx, data.user = ctx->user; data.free_work = io_free_work; + data.do_work = io_wq_submit_work; if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { /* Do QD, or 4 * CPUS, whatever is smallest */ -- Pavel Begunkov