On Fri, Aug 2, 2024 at 2:52 PM Bernd Schubert <bschubert@xxxxxxx> wrote: > > Read/writes IOs should be page aligned as fuse server I think you meant just "write IOs" here and not reads? > might need to copy data to another buffer otherwise in > order to fulfill network or device storage requirements. > > Simple reproducer is with libfuse, example/passthrough* > and opening a file with O_DIRECT - without this change > writing to that file failed with -EINVAL if the underlying > file system was requiring alignment. > > Required server side changes: > Server needs to seek to the next page, without splice that is > just page size buffer alignment, with splice another splice > syscall is needed to seek over the unaligned area. > > Signed-off-by: Bernd Schubert <bschubert@xxxxxxx> > > --- > > Changes since v1: > - Fuse client does not send the alignment offset anymore in the write > header. > - Use FOPEN_ALIGNED_WRITES to be filled in by FUSE_OPEN and FUSE_CREATE > instead of a FUSE_INIT flag to allow control per file and to safe > init flags. > - Instead of seeking a fixed offset, fuse_copy_align() just seeks to the > next page. > - Added sanity checks in fuse_copy_align(). > > libfuse patch: > https://github.com/libfuse/libfuse/pull/983 > > From implmentation point of view it is debatable if request type > parsing should be done in fuse_copy_args() or if alignment > should be stored in struct fuse_arg / fuse_in_arg. > --- > fs/fuse/dev.c | 25 +++++++++++++++++++++++-- > fs/fuse/file.c | 6 ++++++ > fs/fuse/fuse_i.h | 6 ++++-- > include/uapi/linux/fuse.h | 9 ++++++++- > 4 files changed, 41 insertions(+), 5 deletions(-) > > diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c > index 9eb191b5c4de..e0db408db90f 100644 > --- a/fs/fuse/dev.c > +++ b/fs/fuse/dev.c > @@ -1009,6 +1009,24 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) > return 0; > } > > +/* Align to the next page */ > +static int fuse_copy_align(struct fuse_copy_state *cs) > +{ > + if (WARN_ON(!cs->write)) > + return -EIO; I understand why you have the check here but in my opinion, fuse_copy_align should just be a generic api since the rest of the arg copying APIs are generic and with having "align" as a bit in the args field, align capability seems generic as well. > + > + if (WARN_ON(cs->move_pages)) > + return -EIO; > + > + if (WARN_ON(cs->len == PAGE_SIZE || cs->offset == 0)) > + return -EIO; > + > + /* Seek to the next page */ > + cs->offset += cs->len; > + cs->len = 0; > + return 0; > +} > + > /* Copy request arguments to/from userspace buffer */ > static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, > unsigned argpages, struct fuse_arg *args, > @@ -1019,10 +1037,13 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, > > for (i = 0; !err && i < numargs; i++) { > struct fuse_arg *arg = &args[i]; > - if (i == numargs - 1 && argpages) > + if (i == numargs - 1 && argpages) { > err = fuse_copy_pages(cs, arg->size, zeroing); > - else > + } else { > err = fuse_copy_one(cs, arg->value, arg->size); > + if (!err && arg->align) > + err = fuse_copy_align(cs); > + } > } > return err; > } > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index f39456c65ed7..931e7324137f 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -1062,6 +1062,12 @@ static void fuse_write_args_fill(struct fuse_io_args *ia, struct fuse_file *ff, > args->in_args[0].size = FUSE_COMPAT_WRITE_IN_SIZE; > else > args->in_args[0].size = sizeof(ia->write.in); > + > + if (ff->open_flags & FOPEN_ALIGNED_WRITES) { > + args->in_args[0].align = 1; This is more of a nit so feel free to disregard, but I think the code is easier to understand if the align bit gets set on the arg that needs alignment instead of on the preceding arg. I think this would also make fuse_copy_args() easier to grok, since the alignment would be done before doing fuse_copy_pages(), which makes it more obvious that the alignment is for the copied out pages > + ia->write.in.write_flags |= FUSE_WRITE_ALIGNED; > + } > + > args->in_args[0].value = &ia->write.in; > args->in_args[1].size = count; > args->out_numargs = 1; > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index f23919610313..1600bd7b1d0d 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -275,13 +275,15 @@ struct fuse_file { > > /** One input argument of a request */ > struct fuse_in_arg { > - unsigned size; > + unsigned int size; > + unsigned int align:1; > const void *value; > }; > > /** One output argument of a request */ > struct fuse_arg { > - unsigned size; > + unsigned int size; > + unsigned int align:1; > void *value; > }; > > diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h > index d08b99d60f6f..742262c7c1eb 100644 > --- a/include/uapi/linux/fuse.h > +++ b/include/uapi/linux/fuse.h > @@ -217,6 +217,9 @@ > * - add backing_id to fuse_open_out, add FOPEN_PASSTHROUGH open flag > * - add FUSE_NO_EXPORT_SUPPORT init flag > * - add FUSE_NOTIFY_RESEND, add FUSE_HAS_RESEND init flag > + * > + * 7.41 > + * - add FOPEN_ALIGNED_WRITES open flag and FUSE_WRITE_ALIGNED write flag > */ > > #ifndef _LINUX_FUSE_H > @@ -252,7 +255,7 @@ > #define FUSE_KERNEL_VERSION 7 > > /** Minor version number of this interface */ > -#define FUSE_KERNEL_MINOR_VERSION 40 > +#define FUSE_KERNEL_MINOR_VERSION 41 > > /** The node ID of the root inode */ > #define FUSE_ROOT_ID 1 > @@ -360,6 +363,7 @@ struct fuse_file_lock { > * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) > * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode > * FOPEN_PASSTHROUGH: passthrough read/write io for this open file > + * FOPEN_ALIGNED_WRITES: Page align write io data > */ > #define FOPEN_DIRECT_IO (1 << 0) > #define FOPEN_KEEP_CACHE (1 << 1) > @@ -369,6 +373,7 @@ struct fuse_file_lock { > #define FOPEN_NOFLUSH (1 << 5) > #define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) > #define FOPEN_PASSTHROUGH (1 << 7) > +#define FOPEN_ALIGNED_WRITES (1 << 8) Nice, I like how you made the flag on open instead of init to allow this option to be file-specific. > > /** > * INIT request/reply flags > @@ -496,10 +501,12 @@ struct fuse_file_lock { > * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed > * FUSE_WRITE_LOCKOWNER: lock_owner field is valid > * FUSE_WRITE_KILL_SUIDGID: kill suid and sgid bits > + * FUSE_WRITE_ALIGNED: Write io data are page aligned > */ > #define FUSE_WRITE_CACHE (1 << 0) > #define FUSE_WRITE_LOCKOWNER (1 << 1) > #define FUSE_WRITE_KILL_SUIDGID (1 << 2) > +#define FUSE_WRITE_ALIGNED (1 << 3) > > /* Obsolete alias; this flag implies killing suid/sgid only. */ > #define FUSE_WRITE_KILL_PRIV FUSE_WRITE_KILL_SUIDGID > -- > 2.43.0 > Regarding where/how alignment should be stored, eg request type parsing vs in fuse_arg/fuse_in_arg structs - I don't feel strongly about this but it feels cleaner to me to do request type parsing given that alignment is only needed for fuse write requests. In my mind, it would look something like this: --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1012,17 +1012,21 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) /* Copy request arguments to/from userspace buffer */ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, unsigned argpages, struct fuse_arg *args, - int zeroing) + int zeroing, bool align) { int err = 0; unsigned i; for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; - if (i == numargs - 1 && argpages) - err = fuse_copy_pages(cs, arg->size, zeroing); - else + if (i == numargs - 1 && argpages) { + if (align) + err = fuse_copy_align(cs); + if (!err) + err = fuse_copy_pages(cs, arg->size, zeroing); + } else { err = fuse_copy_one(cs, arg->value, arg->size); + } } return err; } +static bool should_align_copy_pages(struct file *file, struct fuse_args *args) +{ + struct fuse_file *ff = file->private_data; + + return (ff->open_flags & FOPEN_ALIGNED_WRITES) && args->opcode == FUSE_WRITE; +} + @@ -1212,6 +1223,7 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_args *args; unsigned reqsize; unsigned int hash; + bool align; /* * Require sane minimum read buffer - that has capacity for fixed part @@ -1296,9 +1308,10 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, spin_unlock(&fpq->lock); cs->req = req; err = fuse_copy_one(cs, &req->in.h, sizeof(req->in.h)); + align = should_align_copy_pages(file, args); if (!err) err = fuse_copy_args(cs, args->in_numargs, args->in_pages, - (struct fuse_arg *) args->in_args, 0); + (struct fuse_arg *) args->in_args, 0, align); fuse_copy_finish(cs); spin_lock(&fpq->lock); clear_bit(FR_LOCKED, &req->flags); @@ -1896,7 +1909,7 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, lastarg->size -= diffsize; } return fuse_copy_args(cs, args->out_numargs, args->out_pages, - args->out_args, args->page_zeroing); + args->out_args, args->page_zeroing, false); } which also seems to me easier to follow in logic than having the align bit in the args. But if align will be something that other operations will also need/request, then I think it makes sense to have it in the args. Thanks, Joanne