Re: [PATCH v4 1/2] virtiofs: use pages instead of pointer for kernel direct IO

Hi,

On 9/3/2024 4:44 PM, Jingbo Xu wrote:
>
> On 8/31/24 5:37 PM, Hou Tao wrote:
>> From: Hou Tao <houtao1@xxxxxxxxxx>
>>
>> When trying to insert a 10MB kernel module kept in a virtio-fs with cache
>> disabled, the following warning was reported:
>>

SNIP
>>
>> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem")
>> Signed-off-by: Hou Tao <houtao1@xxxxxxxxxx>
> Tested-by: Jingbo Xu <jefflexu@xxxxxxxxxxxxxxxxx>

Thanks for the test.
>
>
>> ---
>>  fs/fuse/file.c      | 62 +++++++++++++++++++++++++++++++--------------
>>  fs/fuse/fuse_i.h    |  6 +++++
>>  fs/fuse/virtio_fs.c |  1 +
>>  3 files changed, 50 insertions(+), 19 deletions(-)
>>
>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
>> index f39456c65ed7..331208d3e4d1 100644
>> --- a/fs/fuse/file.c
>> +++ b/fs/fuse/file.c
>> @@ -645,7 +645,7 @@ void fuse_read_args_fill(struct fuse_io_args *ia, struct file *file, loff_t pos,
>>  	args->out_args[0].size = count;
>>  }
>>  
>> -

SNIP
>>  static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
>>  			       size_t *nbytesp, int write,
>> -			       unsigned int max_pages)
>> +			       unsigned int max_pages,
>> +			       bool use_pages_for_kvec_io)
>>  {
>> +	bool flush_or_invalidate = false;
>>  	size_t nbytes = 0;  /* # bytes already packed in req */
>>  	ssize_t ret = 0;
>>  
>> -	/* Special case for kernel I/O: can copy directly into the buffer */
>> +	/* Special case for kernel I/O: can copy directly into the buffer.
>> +	 * However if the implementation of fuse_conn requires pages instead of
>> +	 * pointer (e.g., virtio-fs), use iov_iter_extract_pages() instead.
>> +	 */
>>  	if (iov_iter_is_kvec(ii)) {
>> -		unsigned long user_addr = fuse_get_user_addr(ii);
>> -		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
>> +		void *user_addr = (void *)fuse_get_user_addr(ii);
>>  
>> -		if (write)
>> -			ap->args.in_args[1].value = (void *) user_addr;
>> -		else
>> -			ap->args.out_args[0].value = (void *) user_addr;
>> +		if (!use_pages_for_kvec_io) {
>> +			size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
>>  
>> -		iov_iter_advance(ii, frag_size);
>> -		*nbytesp = frag_size;
>> -		return 0;
>> +			if (write)
>> +				ap->args.in_args[1].value = user_addr;
>> +			else
>> +				ap->args.out_args[0].value = user_addr;
>> +
>> +			iov_iter_advance(ii, frag_size);
>> +			*nbytesp = frag_size;
>> +			return 0;
>> +		}
>> +
>> +		if (is_vmalloc_addr(user_addr)) {
>> +			ap->args.vmap_base = user_addr;
>> +			flush_or_invalidate = true;
> Could we move flush_kernel_vmap_range() upon here, so that
> flush_or_invalidate is not needed anymore and the code looks cleaner?

flush_kernel_vmap_range() needs to know the length of the area being
flushed. If the call were moved up here, that length would not be known
yet; the total number of bytes is only available after the
page-extraction loop below has run.
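
Just to illustrate the ordering constraint (a rough sketch only, reusing
the names from the hunk above, not an alternative implementation):

	if (is_vmalloc_addr(user_addr)) {
		/* length is still unknown here, only remember the base */
		ap->args.vmap_base = user_addr;
		flush_or_invalidate = true;
	}

	/* ... the loop below extracts pages and accumulates nbytes ... */

	/* only after the loop is the total length known, so flush here */
	if (write && flush_or_invalidate)
		flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
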
>
>> +		}
>>  	}
>>  
>>  	while (nbytes < *nbytesp && ap->num_pages < max_pages) {
>> @@ -1513,6 +1533,10 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii,
>>  			(PAGE_SIZE - ret) & (PAGE_SIZE - 1);
>>  	}
>>  
>> +	if (write && flush_or_invalidate)
>> +		flush_kernel_vmap_range(ap->args.vmap_base, nbytes);
>> +
>> +	ap->args.invalidate_vmap = !write && flush_or_invalidate;
> How about initializing vmap_base only when the data buffer is vmalloced
> and it's a read request?  In this case invalidate_vmap is no longer needed.

You mean using the value of vmap_base itself to indicate whether the
invalidation is needed, right? I prefer to keep the separate bit: the
extra invalidate_vmap flag makes the required action on the vmap area
explicit, and it doesn't increase the size of struct fuse_args.
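
FWIW, keeping the bit also keeps the completion side straightforward. A
sketch of how the release path consumes it (the body of
fuse_release_user_pages() is not quoted above, so this is only
illustrative):

	/* in fuse_release_user_pages(ap, nres, should_dirty) */
	if (nres > 0 && ap->args.invalidate_vmap) {
		/*
		 * Read into a vmalloc'ed kvec: drop stale cache lines
		 * before the caller looks at the returned data.
		 */
		invalidate_kernel_vmap_range(ap->args.vmap_base, nres);
	}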

>
>>  	ap->args.is_pinned = iov_iter_extract_will_pin(ii);
>>  	ap->args.user_pages = true;
>>  	if (write)
>> @@ -1581,7 +1605,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
>>  		size_t nbytes = min(count, nmax);
>>  
>>  		err = fuse_get_user_pages(&ia->ap, iter, &nbytes, write,
>> -					  max_pages);
>> +					  max_pages, fc->use_pages_for_kvec_io);
>>  		if (err && !nbytes)
>>  			break;
>>  
>> @@ -1595,7 +1619,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
>>  		}
>>  
>>  		if (!io->async || nres < 0) {
>> -			fuse_release_user_pages(&ia->ap, io->should_dirty);
>> +			fuse_release_user_pages(&ia->ap, nres, io->should_dirty);
>>  			fuse_io_free(ia);
>>  		}
>>  		ia = NULL;
>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
>> index f23919610313..79add14c363f 100644
>> --- a/fs/fuse/fuse_i.h
>> +++ b/fs/fuse/fuse_i.h
>> @@ -309,9 +309,12 @@ struct fuse_args {
>>  	bool may_block:1;
>>  	bool is_ext:1;
>>  	bool is_pinned:1;
>> +	bool invalidate_vmap:1;
>>  	struct fuse_in_arg in_args[3];
>>  	struct fuse_arg out_args[2];
>>  	void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
>> +	/* Used for kvec iter backed by vmalloc address */
>> +	void *vmap_base;
>>  };
>>  
>>  struct fuse_args_pages {
>> @@ -860,6 +863,9 @@ struct fuse_conn {
>>  	/** Passthrough support for read/write IO */
>>  	unsigned int passthrough:1;
>>  
>> +	/* Use pages instead of pointer for kernel I/O */
>> +	unsigned int use_pages_for_kvec_io:1;
> Maybe we need a better (actually shorter) name for this flag. kvec_pages?

Naming is hard. "use_pages_for_kvec_io" is indeed verbose, and
kvec_pages reads better. Will update it in the next spin.
>
>> +
>>  	/** Maximum stack depth for passthrough backing files */
>>  	int max_stack_depth;
>>  
>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c
>> index dd5260141615..43d66ab5e891 100644
>> --- a/fs/fuse/virtio_fs.c
>> +++ b/fs/fuse/virtio_fs.c
>> @@ -1568,6 +1568,7 @@ static int virtio_fs_get_tree(struct fs_context *fsc)
>>  	fc->delete_stale = true;
>>  	fc->auto_submounts = true;
>>  	fc->sync_fs = true;
>> +	fc->use_pages_for_kvec_io = true;
>>  
>>  	/* Tell FUSE to split requests that exceed the virtqueue's size */
>>  	fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit,




