Re: [PATCH RFC 02/10] fs/locks: Export F_LAYOUT lease to user space

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, 2019-06-05 at 18:45 -0700, ira.weiny@xxxxxxxxx wrote:
> From: Ira Weiny <ira.weiny@xxxxxxxxx>
> 
> GUP longterm pins of non-pagecache file system pages (eg FS DAX) are
> currently disallowed because they are unsafe.
> 
> The danger for pinning these pages comes from the fact that hole punch
> and/or truncate of those files results in the pages being mapped and
> pinned by a user space process while DAX has potentially allocated those
> pages to other processes.
> 
> Most (All) users who are mapping FS DAX pages for long term pin purposes
> (such as RDMA) are not going to want to deallocate these pages while
> those pages are in use.  To do so would mean the application would lose
> data.  So the use case for allowing truncate operations of such pages
> is limited.
> 
> However, the kernel must protect itself and users from potential
> mistakes and/or malicious user space code.  Rather than disabling long
> term pins as is done now, allow users who know they are going to
> be pinning this memory to alert the file system of this intention.
> Furthermore, allow users to be alerted such that they can react if a
> truncate operation occurs for some reason.
> 
> Example user space pseudocode for a user using RDMA and wanting to allow
> a truncate would look like this:
> 
> lease_break_sigio_handler() {
> ...
> 	if (sigio.fd == rdma_fd) {
> 		complete_rdma_operations(...);
> 		ibv_dereg_mr(mr);
> 		close(rdma_fd);
> 		fcntl(rdma_fd, F_SETLEASE, F_UNLCK);
> 	}
> }
> 
> setup_rdma_to_dax_file() {
> ...
> 	rdma_fd = open(...)
> 	fcntl(rdma_fd, F_SETLEASE, F_LAYOUT);

I'm not crazy about this interface. F_LAYOUT doesn't seem to be in the
same category as F_RDLCK/F_WRLCK/F_UNLCK.

Maybe instead of F_SETLEASE, this should use new
F_SETLAYOUT/F_GETLAYOUT cmd values? There is nothing that would prevent
you from setting both a lease and a layout on a file, and indeed knfsd
can set both.

This interface seems to conflate the two.

> 	sigaction(SIGIO, ...  lease_break ...);
> 	ptr = mmap(rdma_fd, ...);
> 	mr = ibv_reg_mr(ptr, ...);
> 	do_rdma_stuff(...);
> }
> 
> Follow on patches implement the notification of the lease holder on
> truncate as well as failing the truncate if the GUP pin is not released.
> 
> This first patch exports the F_LAYOUT lease type and allows the user to set
> and get it.
> 
> After the complete series:
> 
> 1) Failure to obtain a F_LAYOUT lease on an open FS DAX file will result
>    in a failure to GUP pin any pages in that file.  An example of a call
>    which results in GUP pin is ibv_reg_mr().
> 2) While the GUP pin is in place (eg MR is in use) truncates of the
>    affected pages will fail.
> 3) If the user registers a sigaction they will be notified of the
>    truncate so they can react.  Failure to react will result in the
>    lease being revoked after <sysfs>/lease-break-time seconds.  After
>    this time new GUP pins will fail without a new lease being taken.
> 4) A truncate will work if the pages being truncated are not actively
>    pinned at the time of truncate.  Attempts to pin these pages after
>    will result in a failure.
> 
> Signed-off-by: Ira Weiny <ira.weiny@xxxxxxxxx>
> ---
>  fs/locks.c                       | 36 +++++++++++++++++++++++++++-----
>  include/linux/fs.h               |  2 +-
>  include/uapi/asm-generic/fcntl.h |  3 +++
>  3 files changed, 35 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/locks.c b/fs/locks.c
> index 0cc2b9f30e22..de9761c068de 100644
> --- a/fs/locks.c
> +++ b/fs/locks.c
> @@ -191,6 +191,8 @@ static int target_leasetype(struct file_lock *fl)
>  		return F_UNLCK;
>  	if (fl->fl_flags & FL_DOWNGRADE_PENDING)
>  		return F_RDLCK;
> +	if (fl->fl_flags & FL_LAYOUT)
> +		return F_LAYOUT;
>  	return fl->fl_type;
>  }
>  
> @@ -611,7 +613,8 @@ static const struct lock_manager_operations lease_manager_ops = {
>  /*
>   * Initialize a lease, use the default lock manager operations
>   */
> -static int lease_init(struct file *filp, long type, struct file_lock *fl)
> +static int lease_init(struct file *filp, long type, unsigned int flags,
> +		      struct file_lock *fl)
>  {
>  	if (assign_type(fl, type) != 0)
>  		return -EINVAL;
> @@ -621,6 +624,8 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
>  
>  	fl->fl_file = filp;
>  	fl->fl_flags = FL_LEASE;
> +	if (flags & FL_LAYOUT)
> +		fl->fl_flags |= FL_LAYOUT;
>  	fl->fl_start = 0;
>  	fl->fl_end = OFFSET_MAX;
>  	fl->fl_ops = NULL;
> @@ -629,7 +634,8 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
>  }
>  
>  /* Allocate a file_lock initialised to this type of lease */
> -static struct file_lock *lease_alloc(struct file *filp, long type)
> +static struct file_lock *lease_alloc(struct file *filp, long type,
> +				     unsigned int flags)
>  {
>  	struct file_lock *fl = locks_alloc_lock();
>  	int error = -ENOMEM;
> @@ -637,7 +643,7 @@ static struct file_lock *lease_alloc(struct file *filp, long type)
>  	if (fl == NULL)
>  		return ERR_PTR(error);
>  
> -	error = lease_init(filp, type, fl);
> +	error = lease_init(filp, type, flags, fl);
>  	if (error) {
>  		locks_free_lock(fl);
>  		return ERR_PTR(error);
> @@ -1588,7 +1594,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
>  	int want_write = (mode & O_ACCMODE) != O_RDONLY;
>  	LIST_HEAD(dispose);
>  
> -	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK);
> +	new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK, 0);
>  	if (IS_ERR(new_fl))
>  		return PTR_ERR(new_fl);
>  	new_fl->fl_flags = type;
> @@ -1725,6 +1731,8 @@ EXPORT_SYMBOL(lease_get_mtime);
>   *
>   *	%F_UNLCK to indicate no lease is held.
>   *
> + *	%F_LAYOUT to indicate a layout lease is held.
> + *
>   *	(if a lease break is pending):
>   *
>   *	%F_RDLCK to indicate an exclusive lease needs to be
> @@ -2015,8 +2023,26 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
>  	struct file_lock *fl;
>  	struct fasync_struct *new;
>  	int error;
> +	unsigned int flags = 0;
> +
> +	/*
> +	 * NOTE on F_LAYOUT lease
> +	 *
> +	 * LAYOUT lease types are taken on files which the user knows that
> +	 * they will be pinning in memory for some indeterminate amount of
> +	 * time, such as for use with RDMA.  While we don't know what user
> +	 * space is going to do with the file we still use a F_RDLCK level of
> +	 * lease.  This ensures that there are no conflicts between
> +	 * 2 users.  The conflict should only come from the File system wanting
> +	 * to revoke the lease in break_layout().  This is done by using
> +	 * F_WRLCK in the break code.
> +	 */
> +	if (arg == F_LAYOUT) {
> +		arg = F_RDLCK;
> +		flags = FL_LAYOUT;
> +	}
>  
> -	fl = lease_alloc(filp, arg);
> +	fl = lease_alloc(filp, arg, flags);
>  	if (IS_ERR(fl))
>  		return PTR_ERR(fl);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index f7fdfe93e25d..9e9d8d35ee93 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -998,7 +998,7 @@ static inline struct file *get_file(struct file *f)
>  #define FL_DOWNGRADE_PENDING	256 /* Lease is being downgraded */
>  #define FL_UNLOCK_PENDING	512 /* Lease is being broken */
>  #define FL_OFDLCK	1024	/* lock is "owned" by struct file */
> -#define FL_LAYOUT	2048	/* outstanding pNFS layout */
> +#define FL_LAYOUT	2048	/* outstanding pNFS layout or user held pin */
>  
>  #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
>  
> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> index 9dc0bf0c5a6e..baddd54f3031 100644
> --- a/include/uapi/asm-generic/fcntl.h
> +++ b/include/uapi/asm-generic/fcntl.h
> @@ -174,6 +174,9 @@ struct f_owner_ex {
>  #define F_SHLCK		8	/* or 4 */
>  #endif
>  
> +#define F_LAYOUT	16      /* layout lease to allow longterm pins such as
> +				   RDMA */
> +
>  /* operations for bsd flock(), also used by the kernel implementation */
>  #define LOCK_SH		1	/* shared lock */
>  #define LOCK_EX		2	/* exclusive lock */




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux