Re: [PATCH] fuse: allow filesystems to have precise control over data cache

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 16.03.2019 00:26, Kirill Smelkov wrote:
> On networked filesystems file data can be changed externally.
> FUSE provides notification messages for filesystem to inform kernel that
> metadata or data region of a file needs to be invalidated in local page
> cache. That provides the basis for filesystem implementations to
> invalidate kernel cache precisely based on observed filesystem-specific
> events.
> 
> FUSE also has an "automatic" invalidation mode(*), in which the kernel
> automatically invalidates the data cache of a file if it sees an mtime
> change. It also automatically invalidates the whole data cache of a file
> if it sees the file size being changed.
> 
> The automatic mode has a corresponding capability - FUSE_AUTO_INVAL_DATA.
> However, probably for historical reasons, that capability controls
> only whether an mtime change should result in automatic invalidation
> or not. A change in file size always results in invalidating the whole
> data cache of a file, regardless of whether FUSE_AUTO_INVAL_DATA was
> negotiated(+).
> 
> The filesystem I write[1] represents data arrays stored in a networked
> database as local files suitable for mmap. It is a read-only filesystem -
> changes to data are committed externally via database interfaces and the
> filesystem only glues data into contiguous file streams suitable for
> mmap and traditional array processing. The files are big - starting from
> hundreds of gigabytes and more. The files change regularly, most often
> by data being appended to their end. The size of files thus changes
> frequently.
> 
> If a file was accessed locally and some part of its data got into page
> cache, we want that data to stay cached unless there is memory pressure,
> or unless corresponding part of the file was actually changed. However
> current FUSE behaviour - when it sees file size change - is to
> invalidate the whole file. The data cache of the file is thus completely
> lost even on small size change, and despite that the filesystem server is
> careful to accurately translate database changes into FUSE invalidation
> messages to kernel.
> 
> Let's fix it: if a filesystem, through the new FUSE_PRECISE_INVAL_DATA
> capability, indicates to the kernel that it is fully responsible for data
> cache invalidation, then the kernel won't invalidate a file's data cache
> on size change and will only truncate that cache to the new size in case
> the size decreased.
> 
> (*) see 72d0d248ca "fuse: add FUSE_AUTO_INVAL_DATA init flag",
> eed2179efe "fuse: invalidate inode mapping if mtime changes"
> 
> (+) in writeback mode the kernel does not invalidate data cache on file
> size change, but neither does it allow the filesystem to set the size due
> to an external event (see 8373200b12 "fuse: Trust kernel i_size only")
> 
> [1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20
> 
> Signed-off-by: Kirill Smelkov <kirr@xxxxxxxxxx>
> ---
>  fs/fuse/fuse_i.h          |  3 +++
>  fs/fuse/inode.c           | 12 ++++++++++--
>  include/uapi/linux/fuse.h |  7 ++++++-
>  3 files changed, 19 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
> index 0920c0c032a0..ca439c72b509 100644
> --- a/fs/fuse/fuse_i.h
> +++ b/fs/fuse/fuse_i.h
> @@ -690,6 +690,9 @@ struct fuse_conn {
>  	/** Use enhanced/automatic page cache invalidation. */
>  	unsigned auto_inval_data:1;
>  
> +	/** Filesystem is fully responsible for page cache invalidation. */
> +	unsigned precise_inval_data:1;
> +
>  	/** Does the filesystem support readdirplus? */
>  	unsigned do_readdirplus:1;
>  
> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
> index 1b3f3b67d9f0..46acd19613b9 100644
> --- a/fs/fuse/inode.c
> +++ b/fs/fuse/inode.c
> @@ -237,7 +237,8 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
>  
>  		if (oldsize != attr->size) {
>  			truncate_pagecache(inode, attr->size);
> -			inval = true;
> +			if (!fc->precise_inval_data)
> +				inval = true;
>  		} else if (fc->auto_inval_data) {
>  			struct timespec64 new_mtime = {
>  				.tv_sec = attr->mtime,
> @@ -912,6 +913,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
>  				fc->dont_mask = 1;
>  			if (arg->flags & FUSE_AUTO_INVAL_DATA)
>  				fc->auto_inval_data = 1;
> +			if (arg->flags & FUSE_PRECISE_INVAL_DATA)
> +				fc->precise_inval_data = 1;
> +			if (fc->auto_inval_data && fc->precise_inval_data) {
> +				printk(KERN_WARNING "fuse: filesystem requested both "

Could you use pr_warn() here instead of printk(KERN_WARNING ...)?

> +					"auto and precise cache control - using auto\n");
> +				fc->precise_inval_data = 0;
> +			}
>  			if (arg->flags & FUSE_DO_READDIRPLUS) {
>  				fc->do_readdirplus = 1;
>  				if (arg->flags & FUSE_READDIRPLUS_AUTO)
> @@ -973,7 +981,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
>  		FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT |
>  		FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL |
>  		FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS |
> -		FUSE_NO_OPENDIR_SUPPORT;
> +		FUSE_NO_OPENDIR_SUPPORT | FUSE_PRECISE_INVAL_DATA;
>  	req->in.h.opcode = FUSE_INIT;
>  	req->in.numargs = 1;
>  	req->in.args[0].size = sizeof(*arg);
> diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
> index 2ac598614a8f..33de8f6391ec 100644
> --- a/include/uapi/linux/fuse.h
> +++ b/include/uapi/linux/fuse.h
> @@ -125,6 +125,9 @@
>   *
>   *  7.29
>   *  - add FUSE_NO_OPENDIR_SUPPORT flag
> + *
> + *  7.30
> + *  - add FUSE_PRECISE_INVAL_DATA
>   */
>  
>  #ifndef _LINUX_FUSE_H
> @@ -160,7 +163,7 @@
>  #define FUSE_KERNEL_VERSION 7
>  
>  /** Minor version number of this interface */
> -#define FUSE_KERNEL_MINOR_VERSION 29
> +#define FUSE_KERNEL_MINOR_VERSION 30
>  
>  /** The node ID of the root inode */
>  #define FUSE_ROOT_ID 1
> @@ -263,6 +266,7 @@ struct fuse_file_lock {
>   * FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages
>   * FUSE_CACHE_SYMLINKS: cache READLINK responses
>   * FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
> + * FUSE_PRECISE_INVAL_DATA: filesystem is fully responsible for data cache invalidation
>   */
>  #define FUSE_ASYNC_READ		(1 << 0)
>  #define FUSE_POSIX_LOCKS	(1 << 1)
> @@ -289,6 +293,7 @@ struct fuse_file_lock {
>  #define FUSE_MAX_PAGES		(1 << 22)
>  #define FUSE_CACHE_SYMLINKS	(1 << 23)
>  #define FUSE_NO_OPENDIR_SUPPORT (1 << 24)
> +#define FUSE_PRECISE_INVAL_DATA (1 << 25)
>  
>  /**
>   * CUSE INIT request/reply flags
> 



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux