Re: [PATCH 6/6] ceph: perform asynchronous unlink if we have sufficient caps

Jeff Layton <jlayton@xxxxxxxxxx> · Thu, 09 Jan 2020 10:03:17 -0500

On Thu, 2020-01-09 at 22:49 +0800, Yan, Zheng wrote:
> On 1/6/20 11:35 PM, Jeff Layton wrote:
> > From: "Yan, Zheng" <zyan@xxxxxxxxxx>
> > 
> > The MDS is getting a new lock-caching facility that will allow it
> > to cache the necessary locks to allow asynchronous directory operations.
> > Since the CEPH_CAP_FILE_* caps are currently unused on directories,
> > we can repurpose those bits for this purpose.
> > 
> > When performing an unlink, if we have Fx on the parent directory,
> > and CEPH_CAP_DIR_UNLINK (aka Fr), and we know that the dentry being
> > removed is the primary link, then then we can fire off an unlink
> > request immediately and don't need to wait on reply before returning.
> > 
> > In that situation, just fix up the dcache and link count and return
> > immediately after issuing the call to the MDS. This does mean that we
> > need to hold an extra reference to the inode being unlinked, and extra
> > references to the caps to avoid races. Those references are put and
> > error handling is done in the r_callback routine.
> > 
> > If the operation ends up failing, then set a writeback error on the
> > directory inode that can be fetched later by an fsync on the dir.
> > 
> > Since this feature is very new, also add a new module parameter to
> > enable and disable it (default is disabled).
> 
> author of this patch is not me :)
> 

I did the initial one and then you did some heavy modification of it, so
really it's authored by both of us. But ok, I'll keep the authorship. :)

> > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx>
> > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > ---
> >   fs/ceph/caps.c               | 35 ++++++++++++------
> >   fs/ceph/dir.c                | 70 +++++++++++++++++++++++++++++++++---
> >   fs/ceph/inode.c              |  8 ++++-
> >   fs/ceph/super.c              |  4 +++
> >   fs/ceph/super.h              |  3 ++
> >   include/linux/ceph/ceph_fs.h |  9 +++++
> >   6 files changed, 113 insertions(+), 16 deletions(-)
> > 
> > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > index d05717397c2a..7fc87b693ba4 100644
> > --- a/fs/ceph/caps.c
> > +++ b/fs/ceph/caps.c
> > @@ -992,7 +992,11 @@ int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
> >   int __ceph_caps_wanted(struct ceph_inode_info *ci)
> >   {
> >   	int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
> > -	if (!S_ISDIR(ci->vfs_inode.i_mode)) {
> > +	if (S_ISDIR(ci->vfs_inode.i_mode)) {
> > +		/* we want EXCL if holding caps of dir ops */
> > +		if (w & CEPH_CAP_ANY_DIR_OPS)
> > +			w |= CEPH_CAP_FILE_EXCL;
> > +	} else {
> >   		/* we want EXCL if dirty data */
> >   		if (w & CEPH_CAP_FILE_BUFFER)
> >   			w |= CEPH_CAP_FILE_EXCL;
> > @@ -1883,10 +1887,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
> >   			 * revoking the shared cap on every create/unlink
> >   			 * operation.
> >   			 */
> > -			if (IS_RDONLY(inode))
> > +			if (IS_RDONLY(inode)) {
> >   				want = CEPH_CAP_ANY_SHARED;
> > -			else
> > -				want = CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_EXCL;
> > +			} else {
> > +				want = CEPH_CAP_ANY_SHARED |
> > +				       CEPH_CAP_FILE_EXCL |
> > +				       CEPH_CAP_ANY_DIR_OPS;
> > +			}
> >   			retain |= want;
> >   		} else {
> >   
> > @@ -2649,7 +2656,10 @@ static int try_get_cap_refs(struct inode *inode, int need, int want,
> >   				}
> >   				snap_rwsem_locked = true;
> >   			}
> > -			*got = need | (have & want);
> > +			if ((have & want) == want)
> > +				*got = need | want;
> > +			else
> > +				*got = need;
> >   			if (S_ISREG(inode->i_mode) &&
> >   			    (need & CEPH_CAP_FILE_RD) &&
> >   			    !(*got & CEPH_CAP_FILE_CACHE))
> > @@ -2739,13 +2749,16 @@ int ceph_try_get_caps(struct inode *inode, int need, int want,
> >   	int ret;
> >   
> >   	BUG_ON(need & ~CEPH_CAP_FILE_RD);
> > -	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
> > -	ret = ceph_pool_perm_check(inode, need);
> > -	if (ret < 0)
> > -		return ret;
> > +	if (need) {
> > +		ret = ceph_pool_perm_check(inode, need);
> > +		if (ret < 0)
> > +			return ret;
> > +	}
> >   
> > -	ret = try_get_cap_refs(inode, need, want, 0,
> > -			       (nonblock ? NON_BLOCKING : 0), got);
> > +	BUG_ON(want & ~(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO |
> > +			CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL |
> > +			CEPH_CAP_ANY_DIR_OPS));
> > +	ret = try_get_cap_refs(inode, need, want, 0, nonblock, got);
> >   	return ret == -EAGAIN ? 0 : ret;
> >   }
> >   
> > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> > index d0cd0aba5843..10294f07f5f0 100644
> > --- a/fs/ceph/dir.c
> > +++ b/fs/ceph/dir.c
> > @@ -1036,6 +1036,47 @@ static int ceph_link(struct dentry *old_dentry, struct inode *dir,
> >   	return err;
> >   }
> >   
> > +static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc,
> > +				 struct ceph_mds_request *req)
> > +{
> > +	/* If op failed, set error on parent directory */
> > +	mapping_set_error(req->r_parent->i_mapping, req->r_err);
> > +	if (req->r_err)
> > +		printk("%s: req->r_err = %d\n", __func__, req->r_err);
> > +	ceph_put_cap_refs(ceph_inode(req->r_parent),
> > +			  CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK);
> > +	iput(req->r_old_inode);
> > +}
> > +
> > +static bool get_caps_for_async_unlink(struct inode *dir, struct dentry *dentry)
> > +{
> > +	struct ceph_inode_info *ci = ceph_inode(dir);
> > +	struct ceph_dentry_info *di;
> > +	int ret, want, got;
> > +
> > +	want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_UNLINK;
> > +	ret = ceph_try_get_caps(dir, 0, want, true, &got);
> > +	dout("Fx on %p ret=%d got=%d\n", dir, ret, got);
> > +	if (ret != 1 || got != want)
> > +		return false;
> > +
> > +        spin_lock(&dentry->d_lock);
> > +        di = ceph_dentry(dentry);
> > +	/* - We are holding CEPH_CAP_FILE_EXCL, which implies
> > +	 * CEPH_CAP_FILE_SHARED.
> > +	 * - Only support async unlink for primary linkage */
> > Well, you'd have to fix up the caller in the first patch to pass
> > some argument, and
> > +	if (atomic_read(&ci->i_shared_gen) != di->lease_shared_gen ||
> > +	    !(di->flags & CEPH_DENTRY_PRIMARY_LINK))
> > +		ret = 0;
> > +        spin_unlock(&dentry->d_lock);
> > +
> > +	if (!ret) {
> > +		ceph_put_cap_refs(ci, got);
> > +		return false;
> > +	}
> > +	return true;
> > +}
> > +
> >   /*
> >    * rmdir and unlink are differ only by the metadata op code
> >    */
> > @@ -1067,13 +1108,33 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)
> >   	req->r_dentry = dget(dentry);
> >   	req->r_num_caps = 2;
> >   	req->r_parent = dir;
> > -	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
> >   	req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
> >   	req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
> >   	req->r_inode_drop = ceph_drop_caps_for_unlink(inode);
> > -	err = ceph_mdsc_do_request(mdsc, dir, req);
> > -	if (!err && !req->r_reply_info.head->is_dentry)
> > -		d_delete(dentry);
> > +
> > +	if (enable_async_dirops && op == CEPH_MDS_OP_UNLINK &&
> > +	    get_caps_for_async_unlink(dir, dentry)) {
> > +		dout("ceph: Async unlink on %lu/%.*s", dir->i_ino,
> > +		     dentry->d_name.len, dentry->d_name.name);
> > +		req->r_callback = ceph_async_unlink_cb;
> > +		req->r_old_inode = d_inode(dentry);
> > +		ihold(req->r_old_inode);
> > +		err = ceph_mdsc_submit_request(mdsc, dir, req);
> > +		if (!err) {
> > +			/*
> > +			 * We have enough caps, so we assume that the unlink
> > +			 * will succeed. Fix up the target inode and dcache.
> > +			 */
> > +			drop_nlink(inode);
> > +			d_delete(dentry);
> > +		}
> > +	} else {
> > +		set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
> > +		err = ceph_mdsc_do_request(mdsc, dir, req);
> > +		if (!err && !req->r_reply_info.head->is_dentry)
> > +			d_delete(dentry);
> > +	}
> > +
> >   	ceph_mdsc_put_request(req);
> >   out:
> >   	return err;
> > @@ -1411,6 +1472,7 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
> >   	spin_lock(&dentry->d_lock);
> >   	di->time = jiffies;
> >   	di->lease_shared_gen = 0;
> > +	di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
> >   	__dentry_lease_unlist(di);
> >   	spin_unlock(&dentry->d_lock);
> >   }
> > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > index ef9e8281d485..ffef475af72b 100644
> > --- a/fs/ceph/inode.c
> > +++ b/fs/ceph/inode.c
> > @@ -1048,6 +1048,7 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
> >   				  struct ceph_mds_session **old_lease_session)
> >   {
> >   	struct ceph_dentry_info *di = ceph_dentry(dentry);
> > +	unsigned mask = le16_to_cpu(lease->mask);
> >   	long unsigned duration = le32_to_cpu(lease->duration_ms);
> >   	long unsigned ttl = from_time + (duration * HZ) / 1000;
> >   	long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
> > @@ -1059,8 +1060,13 @@ static void __update_dentry_lease(struct inode *dir, struct dentry *dentry,
> >   	if (ceph_snap(dir) != CEPH_NOSNAP)
> >   		return;
> >   
> > +	if (mask & CEPH_LEASE_PRIMARY_LINK)
> > +		di->flags |= CEPH_DENTRY_PRIMARY_LINK;
> > +	else
> > +		di->flags &= ~CEPH_DENTRY_PRIMARY_LINK;
> > +
> >   	di->lease_shared_gen = atomic_read(&ceph_inode(dir)->i_shared_gen);
> > -	if (duration == 0) {
> > +	if (!(mask & CEPH_LEASE_VALID)) {
> >   		__ceph_dentry_dir_lease_touch(di);
> >   		return;
> >   	}
> > diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> > index 112927dbd2f2..c149edb6aa2d 100644
> > --- a/fs/ceph/super.c
> > +++ b/fs/ceph/super.c
> > @@ -1324,6 +1324,10 @@ static void __exit exit_ceph(void)
> >   	destroy_caches();
> >   }
> >   
> > +bool enable_async_dirops;
> > +module_param(enable_async_dirops, bool, 0644);
> > +MODULE_PARM_DESC(enable_async_dirops, "Asynchronous directory operations enabled");
> > +
> >   module_init(init_ceph);
> >   module_exit(exit_ceph);
> >   
> > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > index 4b53f32b9324..4083de451710 100644
> > --- a/fs/ceph/super.h
> > +++ b/fs/ceph/super.h
> > @@ -72,6 +72,8 @@
> >   #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
> >   #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
> >   
> > +extern bool enable_async_dirops;
> > +
> >   struct ceph_mount_options {
> >   	unsigned int flags;
> >   
> > @@ -282,6 +284,7 @@ struct ceph_dentry_info {
> >   #define CEPH_DENTRY_REFERENCED		1
> >   #define CEPH_DENTRY_LEASE_LIST		2
> >   #define CEPH_DENTRY_SHRINK_LIST		4
> > +#define CEPH_DENTRY_PRIMARY_LINK	8
> >   
> >   struct ceph_inode_xattrs_info {
> >   	/*
> > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > index cb21c5cf12c3..a099f60feb7b 100644
> > --- a/include/linux/ceph/ceph_fs.h
> > +++ b/include/linux/ceph/ceph_fs.h
> > @@ -530,6 +530,9 @@ struct ceph_mds_reply_lease {
> >   	__le32 seq;
> >   } __attribute__ ((packed));
> >   
> > +#define CEPH_LEASE_VALID        (1 | 2) /* old and new bit values */
> > +#define CEPH_LEASE_PRIMARY_LINK 4       /* primary linkage */
> > +
> >   struct ceph_mds_reply_dirfrag {
> >   	__le32 frag;            /* fragment */
> >   	__le32 auth;            /* auth mds, if this is a delegation point */
> > @@ -659,6 +662,12 @@ int ceph_flags_to_mode(int flags);
> >   #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
> >   			CEPH_LOCK_IXATTR)
> >   
> > +/* cap masks async dir operations */
> > +#define CEPH_CAP_DIR_CREATE	CEPH_CAP_FILE_CACHE
> > +#define CEPH_CAP_DIR_UNLINK	CEPH_CAP_FILE_RD
> > +#define CEPH_CAP_ANY_DIR_OPS	(CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD | \
> > +				 CEPH_CAP_FILE_WREXTEND | CEPH_CAP_FILE_LAZYIO)
> > +
> >   int ceph_caps_for_mode(int mode);
> >   
> >   enum {
> > 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>