On Wed, May 10, 2023 at 07:52:54PM +0900, Dominique Martinet wrote: > This turns out to be very slightly faster than an extra call to > getdents, but in practice it doesn't seem to be such an improvement as > the trailing getdents will return almost immediately be absorbed by the > scheduling noise in a find-like context (my ""server"" is too noisy to > get proper benchmarks out, but results look slightly better with this in > async mode, and almost identical in the NOWAIT path) > > If the user is waiting the end of a single directory though it might be > worth it, so including the patch for comments. > (in particular I'm not really happy that the flag has become in-out for > vfs_getdents, especially when the getdents64 syscall does not use it, > but I don't see much other way around it) > > If this approach is acceptable/wanted then this patch will be split down > further (at least dir_context/vfs_getdents, kernfs, libfs, uring in four > separate commits) > > Signed-off-by: Dominique Martinet <asmadeus@xxxxxxxxxxxxx> > --- > fs/internal.h | 2 +- > fs/kernfs/dir.c | 1 + > fs/libfs.c | 9 ++++++--- > fs/readdir.c | 10 ++++++---- > include/linux/fs.h | 2 ++ > include/uapi/linux/io_uring.h | 2 ++ > io_uring/fs.c | 8 ++++++-- > 7 files changed, 24 insertions(+), 10 deletions(-) > > diff --git a/fs/internal.h b/fs/internal.h > index 0264b001d99a..0b1552c7a870 100644 > --- a/fs/internal.h > +++ b/fs/internal.h > @@ -267,4 +267,4 @@ void mnt_idmap_put(struct mnt_idmap *idmap); > struct linux_dirent64; > > int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent, > - unsigned int count, unsigned long flags); > + unsigned int count, unsigned long *flags); > diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c > index 5a5b3e7881bf..53a6b4804c34 100644 > --- a/fs/kernfs/dir.c > +++ b/fs/kernfs/dir.c > @@ -1860,6 +1860,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) > up_read(&root->kernfs_rwsem); > file->private_data = NULL; > ctx->pos = INT_MAX; > + ctx->flags |= DIR_CONTEXT_F_EOD; > return 0; > } > > diff --git a/fs/libfs.c b/fs/libfs.c > index a3c7e42d90a7..b2a95dadffbd 100644 > --- a/fs/libfs.c > +++ b/fs/libfs.c > @@ -208,10 +208,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx) > p = &next->d_child; > } > spin_lock(&dentry->d_lock); > - if (next) > + if (next) { > list_move_tail(&cursor->d_child, &next->d_child); > - else > + } else { > list_del_init(&cursor->d_child); > + ctx->flags |= DIR_CONTEXT_F_EOD; > + } > spin_unlock(&dentry->d_lock); > dput(next); > > @@ -1347,7 +1349,8 @@ static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) > > static int empty_dir_readdir(struct file *file, struct dir_context *ctx) > { > - dir_emit_dots(file, ctx); > + if (dir_emit_dots(file, ctx)) > + ctx->flags |= DIR_CONTEXT_F_EOD; > return 0; > } > > diff --git a/fs/readdir.c b/fs/readdir.c > index 1311b89d75e1..be75a2154b4f 100644 > --- a/fs/readdir.c > +++ b/fs/readdir.c > @@ -358,14 +358,14 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, > * @file : pointer to file struct of directory > * @dirent : pointer to user directory structure > * @count : size of buffer > - * @flags : additional dir_context flags > + * @flags : pointer to additional dir_context flags > */ > int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent, > - unsigned int count, unsigned long flags) > + unsigned int count, unsigned long *flags) > { > struct getdents_callback64 buf = { > .ctx.actor = filldir64, > - .ctx.flags = flags, > + .ctx.flags = flags ? *flags : 0, > .count = count, > .current_dir = dirent > }; > @@ -384,6 +384,8 @@ int vfs_getdents(struct file *file, struct linux_dirent64 __user *dirent, > else > error = count - buf.count; > } > + if (flags) > + *flags = buf.ctx.flags; > return error; > } > > @@ -397,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, > if (!f.file) > return -EBADF; > > - error = vfs_getdents(f.file, dirent, count, 0); > + error = vfs_getdents(f.file, dirent, count, NULL); > > fdput_pos(f); > return error; > diff --git a/include/linux/fs.h b/include/linux/fs.h > index f7de2b5ca38e..d1e31bccfb4f 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -1723,8 +1723,10 @@ struct dir_context { > * flags for dir_context flags > * DIR_CONTEXT_F_NOWAIT: Request non-blocking iterate > * (requires file->f_mode & FMODE_NOWAIT) > + * DIR_CONTEXT_F_EOD: Signal directory has been fully iterated, set by the fs > */ > #define DIR_CONTEXT_F_NOWAIT 0x1 > +#define DIR_CONTEXT_F_EOD 0x2 > > /* > * These flags let !MMU mmap() govern direct device mapping vs immediate > diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h > index 35d0de18d893..35877132027e 100644 > --- a/include/uapi/linux/io_uring.h > +++ b/include/uapi/linux/io_uring.h > @@ -381,11 +381,13 @@ struct io_uring_cqe { > * IORING_CQE_F_SOCK_NONEMPTY If set, more data to read after socket recv > * IORING_CQE_F_NOTIF Set for notification CQEs. Can be used to distinct > * them from sends. > + * IORING_CQE_F_EOF If set, file or directory has reached end of file. > */ > #define IORING_CQE_F_BUFFER (1U << 0) > #define IORING_CQE_F_MORE (1U << 1) > #define IORING_CQE_F_SOCK_NONEMPTY (1U << 2) > #define IORING_CQE_F_NOTIF (1U << 3) > +#define IORING_CQE_F_EOF (1U << 4) > > enum { > IORING_CQE_BUFFER_SHIFT = 16, > diff --git a/io_uring/fs.c b/io_uring/fs.c > index b15ec81c1ed2..f6222b0148ef 100644 > --- a/io_uring/fs.c > +++ b/io_uring/fs.c > @@ -322,6 +322,7 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags) > { > struct io_getdents *gd = io_kiocb_to_cmd(req, struct io_getdents); > unsigned long getdents_flags = 0; > + u32 cqe_flags = 0; > int ret; > > if (issue_flags & IO_URING_F_NONBLOCK) { > @@ -338,13 +339,16 @@ int io_getdents(struct io_kiocb *req, unsigned int issue_flags) > goto out; > } > > - ret = vfs_getdents(req->file, gd->dirent, gd->count, getdents_flags); > + ret = vfs_getdents(req->file, gd->dirent, gd->count, &getdents_flags); I don't understand how synchronization and updating of f_pos works here. For example, what happens if a concurrent seek happens on the fd while io_uring is using vfs_getdents which calls into iterate_dir() and updates f_pos?