IORING_OP_GETDENTS64 behaves like getdents64(2) and takes the same arguments. Signed-off-by: Lennert Buytenhek <buytenh@xxxxxxxxxxxxxx> --- This seems to work OK, but I'd appreciate a review from someone more familiar with io_uring internals than I am, as I'm not entirely sure I did everything quite right. A dumb test program for IORING_OP_GETDENTS64 is available here: https://krautbox.wantstofly.org/~buytenh/uringfind.c This does more or less what find(1) does: it scans recursively through a directory tree and prints the names of all directories and files it encounters along the way -- but it does so using io_uring. (The uring version prints the names of encountered files and directories in an order that's determined by SQE completion order, which is somewhat nondeterministic and likely to differ between runs.) On a directory tree containing roughly 14 million files, stored on a six-drive (spinning disk) btrfs raid, find(1) takes: # echo 3 > /proc/sys/vm/drop_caches # time find /mnt/repo > /dev/null real 24m7.815s user 0m15.015s sys 0m48.340s # And the io_uring version takes: # echo 3 > /proc/sys/vm/drop_caches # time ./uringfind /mnt/repo > /dev/null real 10m29.064s user 0m4.347s sys 0m1.677s # These timings are repeatable and consistent to within a few seconds. (btrfs seems to be sending most metadata reads to the same drive in the array during this test, even though this filesystem is using the raid1c4 profile for metadata, so I suspect that more drive-level parallelism can be extracted with some btrfs tweaks.) 
The fully cached case also shows some speedup for the io_uring version: # time find /mnt/repo > /dev/null real 0m5.223s user 0m1.926s sys 0m3.268s # vs: # time ./uringfind /mnt/repo > /dev/null real 0m3.604s user 0m2.417s sys 0m0.793s # That said, the point of this patch isn't primarily to enable lightning-fast find(1) or du(1), but more to complete the set of filesystem I/O primitives available via io_uring, so that applications can do all of their filesystem I/O using the same mechanism, without having to manually punt some of their work out to worker threads -- and indeed, an object storage backend server that I wrote a while ago can run with a pure io_uring based event loop with this patch. One open question is whether IORING_OP_GETDENTS64 should be more like pread(2) and allow passing in a starting offset from which to read the directory. (This would require some more surgery in fs/readdir.c.) fs/io_uring.c | 51 ++++++++++++++++++++++++++++++++++++++++++ fs/readdir.c | 25 ++++++++++++++------ include/linux/fs.h | 4 +++ include/uapi/linux/io_uring.h | 1 4 files changed, 73 insertions(+), 8 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 985a9e3f976d..5d79b9668ee0 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -572,6 +572,12 @@ struct io_unlink { struct filename *filename; }; +struct io_getdents64 { + struct file *file; + struct linux_dirent64 __user *dirent; + unsigned int count; +}; + struct io_completion { struct file *file; struct list_head list; @@ -699,6 +705,7 @@ struct io_kiocb { struct io_shutdown shutdown; struct io_rename rename; struct io_unlink unlink; + struct io_getdents64 getdents64; /* use only after cleaning per-op data, see io_clean_op() */ struct io_completion compl; }; @@ -987,6 +994,11 @@ static const struct io_op_def io_op_defs[] = { .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, + [IORING_OP_GETDENTS64] = { + .needs_file = 1, + .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | + 
IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, + }, }; enum io_mem_account { @@ -4552,6 +4564,40 @@ static int io_sync_file_range(struct io_kiocb *req, bool force_nonblock) return 0; } +static int io_getdents64_prep(struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_getdents64 *getdents64 = &req->getdents64; + + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) + return -EINVAL; + if (sqe->ioprio || sqe->off || sqe->rw_flags || sqe->buf_index) + return -EINVAL; + + getdents64->dirent = u64_to_user_ptr(READ_ONCE(sqe->addr)); + getdents64->count = READ_ONCE(sqe->len); + return 0; +} + +static int io_getdents64(struct io_kiocb *req, bool force_nonblock) +{ + struct io_getdents64 *getdents64 = &req->getdents64; + int ret; + + /* getdents64 always requires a blocking context */ + if (force_nonblock) + return -EAGAIN; + + ret = vfs_getdents64(req->file, getdents64->dirent, getdents64->count); + if (ret < 0) { + if (ret == -ERESTARTSYS) + ret = -EINTR; + req_set_fail_links(req); + } + io_req_complete(req, ret); + return 0; +} + #if defined(CONFIG_NET) static int io_setup_async_msg(struct io_kiocb *req, struct io_async_msghdr *kmsg) @@ -6078,6 +6124,8 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return io_renameat_prep(req, sqe); case IORING_OP_UNLINKAT: return io_unlinkat_prep(req, sqe); + case IORING_OP_GETDENTS64: + return io_getdents64_prep(req, sqe); } printk_once(KERN_WARNING "io_uring: unhandled opcode %d\n", @@ -6337,6 +6385,9 @@ static int io_issue_sqe(struct io_kiocb *req, bool force_nonblock, case IORING_OP_UNLINKAT: ret = io_unlinkat(req, force_nonblock); break; + case IORING_OP_GETDENTS64: + ret = io_getdents64(req, force_nonblock); + break; default: ret = -EINVAL; break; diff --git a/fs/readdir.c b/fs/readdir.c index 19434b3c982c..5310677d5d36 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -348,10 +348,9 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EFAULT; } 
-SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) +int vfs_getdents64(struct file *file, struct linux_dirent64 __user *dirent, + unsigned int count) { - struct fd f; struct getdents_callback64 buf = { .ctx.actor = filldir64, .count = count, @@ -359,11 +358,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, }; int error; - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); + error = iterate_dir(file, &buf.ctx); if (error >= 0) error = buf.error; if (buf.prev_reclen) { @@ -376,6 +371,20 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, else error = count - buf.count; } + return error; +} + +SYSCALL_DEFINE3(getdents64, unsigned int, fd, + struct linux_dirent64 __user *, dirent, unsigned int, count) +{ + struct fd f; + int error; + + f = fdget_pos(fd); + if (!f.file) + return -EBADF; + + error = vfs_getdents64(f.file, dirent, count); fdput_pos(f); return error; } diff --git a/include/linux/fs.h b/include/linux/fs.h index fd47deea7c17..602202a8fc1f 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3109,6 +3109,10 @@ extern const struct inode_operations simple_symlink_inode_operations; extern int iterate_dir(struct file *, struct dir_context *); +struct linux_dirent64; +int vfs_getdents64(struct file *file, struct linux_dirent64 __user *dirent, + unsigned int count); + int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags); int vfs_fstat(int fd, struct kstat *stat); diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index d31a2a1e8ef9..5602414735f7 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -137,6 +137,7 @@ enum { IORING_OP_SHUTDOWN, IORING_OP_RENAMEAT, IORING_OP_UNLINKAT, + IORING_OP_GETDENTS64, /* this goes last, obviously */ IORING_OP_LAST,