Hi, On 9/3/2024 4:44 PM, Jingbo Xu wrote: > > On 8/31/24 5:37 PM, Hou Tao wrote: >> From: Hou Tao <houtao1@xxxxxxxxxx> >> >> When trying to insert a 10MB kernel module kept in a virtio-fs with cache >> disabled, the following warning was reported: >> SNIP >> >> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") >> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx> > Tested-by: Jingbo Xu <jefflexu@xxxxxxxxxxxxxxxxx> Thanks for the test. > > >> --- >> fs/fuse/file.c | 62 +++++++++++++++++++++++++++++++-------------- >> fs/fuse/fuse_i.h | 6 +++++ >> fs/fuse/virtio_fs.c | 1 + >> 3 files changed, 50 insertions(+), 19 deletions(-) >> >> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >> index f39456c65ed7..331208d3e4d1 100644 >> --- a/fs/fuse/file.c >> +++ b/fs/fuse/file.c >> @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos, >> args->out_args[0].size = count; >> } >> >> - SNIP >> static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, >> size_t *nbytesp, int write, >> - unsigned int max_pages) >> + unsigned int max_pages, >> + bool use_pages_for_kvec_io) >> { >> + bool flush_or_invalidate = false; >> size_t nbytes = 0; /* # bytes already packed in req */ >> ssize_t ret = 0; >> >> - /* Special case for kernel I/O: can copy directly into the buffer */ >> + /* Special case for kernel I/O: can copy directly into the buffer. >> + * However if the implementation of fuse_conn requires pages instead of >> + * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead. >> + */ >> if (iov_iter_is_kvec(ii)) { >> - unsigned long user_addr = fuse_get_user_addr(ii); >> - size_t frag_size = fuse_get_frag_size(ii, *nbytesp); >> + void *user_addr = (void *)fuse_get_user_addr(ii); >> >> - if (write) >> - ap->args.in_args[1].value = (void *) user_addr; >> - else >> - ap->args.out_args[0].value = (void *) user_addr; >> + if (!use_pages_for_kvec_io) { >> + size_t frag_size = fuse_get_frag_size(ii, *nbytesp); >> >> - iov_iter_advance(ii, frag_size); >> - *nbytesp = frag_size; >> - return 0; >> + if (write) >> + ap->args.in_args[1].value = user_addr; >> + else >> + ap->args.out_args[0].value = user_addr; >> + >> + iov_iter_advance(ii, frag_size); >> + *nbytesp = frag_size; >> + return 0; >> + } >> + >> + if (is_vmalloc_addr(user_addr)) { >> + ap->args.vmap_base = user_addr; >> + flush_or_invalidate = true; > Could we move flush_kernel_vmap_range() upon here, so that > flush_or_invalidate is not needed anymore and the code looks cleaner? flush_kernel_vmap_range() needs to know the length of the flushed area, if moving it here(), the length will be unknown. > >> + } >> } >> >> while (nbytes < *nbytesp && ap->num_pages < max_pages) { >> @@ -1513,6 +1533,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, >> (PAGE_SIZE - ret) & (PAGE_SIZE - 1); >> } >> >> + if (write && flush_or_invalidate) >> + flush_kernel_vmap_range(ap->args.vmap_base, nbytes); >> + >> + ap->args.invalidate_vmap = !write && flush_or_invalidate; > How about initializing vmap_base only when the data buffer is vmalloced > and it's a read request? In this case invalidate_vmap is no longer needed. You mean using the value of vmap_base to indicate whether invalidation is needed or not, right ? I prefer to keep it, because the extra variable invalidate_vmap indicates the required action for the vmap area and it doesn't increase the size of fuse_args. > >> ap->args.is_pinned = iov_iter_extract_will_pin(ii); >> ap->args.user_pages = true; >> if (write) >> @@ -1581,7 +1605,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, >> size_t nbytes = min(count, nmax); >> >> err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write, >> - max_pages); >> + max_pages, fc->use_pages_for_kvec_io); >> if (err && !nbytes) >> break; >> >> @@ -1595,7 +1619,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, >> } >> >> if (!io->async || nres < 0) { >> - fuse_release_user_pages(&ia->ap, io->should_dirty); >> + fuse_release_user_pages(&ia->ap, nres, io->should_dirty); >> fuse_io_free(ia); >> } >> ia = NULL; >> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >> index f23919610313..79add14c363f 100644 >> --- a/fs/fuse/fuse_i.h >> +++ b/fs/fuse/fuse_i.h >> @@ -309,9 +309,12 @@ struct fuse_args { >> bool may_block:1; >> bool is_ext:1; >> bool is_pinned:1; >> + bool invalidate_vmap:1; >> struct fuse_in_arg in_args[3]; >> struct fuse_arg out_args[2]; >> void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); >> + /* Used for kvec iter backed by vmalloc address */ >> + void *vmap_base; >> }; >> >> struct fuse_args_pages { >> @@ -860,6 +863,9 @@ struct fuse_conn { >> /** Passthrough support for read/write IO */ >> unsigned int passthrough:1; >> >> + /* Use pages instead of pointer for kernel I/O */ >> + unsigned int use_pages_for_kvec_io:1; > Maybe we need a better (actually shorter) name for this flag. kvec_pages? Naming is hard. The name "use_pages_for_kvec_io" is verbose indeed. kvec_pages is better. Will update it in the next spin. > >> + >> /** Maximum stack depth for passthrough backing files */ >> int max_stack_depth; >> >> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c >> index dd5260141615..43d66ab5e891 100644 >> --- a/fs/fuse/virtio_fs.c >> +++ b/fs/fuse/virtio_fs.c >> @@ -1568,6 +1568,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc) >> fc->delete_stale = true; >> fc->auto_submounts = true; >> fc->sync_fs = true; >> + fc->use_pages_for_kvec_io = true; >> >> /* Tell FUSE to split requests that exceed the virtqueue's size */ >> fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,