Re: [RFC PATCH 9/9] ceph: attempt to do async create when possible

Jeff Layton <jlayton@xxxxxxxxxx> · Mon, 13 Jan 2020 10:20:27 -0500

On Mon, 2020-01-13 at 22:48 +0800, Yan, Zheng wrote:
> On 1/13/20 9:44 PM, Jeff Layton wrote:
> > On Mon, 2020-01-13 at 18:53 +0800, Yan, Zheng wrote:
> > > On 1/11/20 4:56 AM, Jeff Layton wrote:
> > > > With the Octopus release, the MDS will hand out directory create caps.
> > > > If we have Fxc caps on the directory, and complete directory information
> > > > or a known negative dentry, then we can return without waiting on the
> > > > reply, allowing the open() call to return very quickly to userland.
> > > > 
> > > > We use the normal ceph_fill_inode() routine to fill in the inode, so we
> > > > have to gin up some reply inode information with what we'd expect a
> > > > newly-created inode to have. The client assumes that it has a full set
> > > > of caps on the new inode, and that the MDS will revoke them when there
> > > > is conflicting access.
> > > > 
> > > > This functionality is gated on the enable_async_dirops module option,
> > > > along with async unlinks, and on the server supporting the Octopus
> > > > CephFS feature bit.
> > > > 
> > > > Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
> > > > ---
> > > >    fs/ceph/caps.c               |   7 +-
> > > >    fs/ceph/file.c               | 178 +++++++++++++++++++++++++++++++++--
> > > >    fs/ceph/mds_client.c         |  12 ++-
> > > >    fs/ceph/mds_client.h         |   3 +-
> > > >    fs/ceph/super.h              |   2 +
> > > >    include/linux/ceph/ceph_fs.h |   8 +-
> > > >    6 files changed, 191 insertions(+), 19 deletions(-)
> > > > 
> > > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> > > > index b96fb1378479..21a8a2ddc94b 100644
> > > > --- a/fs/ceph/caps.c
> > > > +++ b/fs/ceph/caps.c
> > > > @@ -654,6 +654,10 @@ void ceph_add_cap(struct inode *inode,
> > > >    		session->s_nr_caps++;
> > > >    		spin_unlock(&session->s_cap_lock);
> > > >    	} else {
> > > > +		/* Did an async create race with the reply? */
> > > > +		if (cap_id == CEPH_CAP_ID_TBD && cap->issued == issued)
> > > > +			return;
> > > > +
> > > >    		spin_lock(&session->s_cap_lock);
> > > >    		list_move_tail(&cap->session_caps, &session->s_caps);
> > > >    		spin_unlock(&session->s_cap_lock);
> > > > @@ -672,7 +676,8 @@ void ceph_add_cap(struct inode *inode,
> > > >    		 */
> > > >    		if (ceph_seq_cmp(seq, cap->seq) <= 0) {
> > > >    			WARN_ON(cap != ci->i_auth_cap);
> > > > -			WARN_ON(cap->cap_id != cap_id);
> > > > +			WARN_ON(cap_id != CEPH_CAP_ID_TBD &&
> > > > +				cap->cap_id != cap_id);
> > > >    			seq = cap->seq;
> > > >    			mseq = cap->mseq;
> > > >    			issued |= cap->issued;
> > > > diff --git a/fs/ceph/file.c b/fs/ceph/file.c
> > > > index d4d7a277faf1..706abd71b731 100644
> > > > --- a/fs/ceph/file.c
> > > > +++ b/fs/ceph/file.c
> > > > @@ -450,6 +450,141 @@ copy_file_layout(struct inode *dst, struct inode *src)
> > > >    	spin_unlock(&cdst->i_ceph_lock);
> > > >    }
> > > >    
> > > > +static bool get_caps_for_async_create(struct inode *dir, struct dentry *dentry)
> > > > +{
> > > > +	struct ceph_inode_info *ci = ceph_inode(dir);
> > > > +	int ret, want, got;
> > > > +
> > > > +	/*
> > > > +	 * We can do an async create if we either have a valid negative dentry
> > > > +	 * or the complete contents of the directory. Do a quick check without
> > > > +	 * cap refs.
> > > > +	 */
> > > > +	if ((d_in_lookup(dentry) && !__ceph_dir_is_complete(ci)) ||
> > > 
> > > what does (d_in_lookup(dentry) && !__ceph_dir_is_complete(ci)) mean?
> > > 
> > 
> > If d_in_lookup returns false, then we have a dentry that is known to be
> > negative (IOW, it passed d_revalidate). If d_in_lookup returns true,
> > then this is the initial lookup for the dentry.
> > 
> > So if that returns true and the directory isn't complete then we can't
> > do an async create since we don't have enough information about the
> > namespace.
> > 
> > That probably deserves a better comment. I'll try to make that clear.
> 
> If the directory is not complete and d_in_lookup() returns false, we know
> the dentry is negative in the dcache, but that does not guarantee that the
> corresponding dentry in the MDS is negative. Between d_revalidate and this
> function, the MDS may have revoked the dentry's lease and issued Fsx caps.

Nothing prevents that from occurring while we are in this function either,
in that case. I guess we need to test for an actual dentry lease here, after
ensuring that we have the correct caps?

> > 
> > > I think we can async create if dentry is negative and its
> > > lease_shared_gen == ci->i_shared_gen.
> > > 
> > > > +	    !ceph_file_layout_is_valid(&ci->i_layout))
> > > > +		return false;
> > > > +
> > > > +	/* Try to get caps */
> > > > +	want = CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE;
> > > > +	ret = ceph_try_get_caps(dir, 0, want, true, &got);
> > > > +	dout("Fx on %p ret=%d got=%d\n", dir, ret, got);
> > > > +	if (ret != 1)
> > > > +		return false;
> > > > +	if (got != want) {
> > > > +		ceph_put_cap_refs(ci, got);
> > > > +		return false;
> > > > +	}
> > > > +
> > > > +	/* Check again, now that we hold cap refs */
> > > > +	if ((d_in_lookup(dentry) && !__ceph_dir_is_complete(ci)) ||
> > > > +	    !ceph_file_layout_is_valid(&ci->i_layout)) {
> > > > +		ceph_put_cap_refs(ci, got);
> > > > +		return false;
> > > > +	}
> > > > +
> > > > +	return true;
> > > > +}
> > > > +
> > > > +static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
> > > > +                                 struct ceph_mds_request *req)
> > > > +{
> > > > +	/* If we never sent anything then nothing to clean up */
> > > > +	if (req->r_err == -ECHILD)
> > > > +		goto out;
> > > > +
> > > > +	mapping_set_error(req->r_parent->i_mapping, req->r_err);
> > > > +
> > > > +	if (req->r_target_inode) {
> > > > +		u64 ino = ceph_vino(req->r_target_inode).ino;
> > > > +
> > > > +		if (req->r_deleg_ino != ino)
> > > > +			pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%lx target=0x%llx\n",
> > > > +				__func__, req->r_err, req->r_deleg_ino, ino);
> > > > +		mapping_set_error(req->r_target_inode->i_mapping, req->r_err);
> > > > +	} else {
> > > > +		pr_warn("%s: no req->r_target_inode for 0x%lx\n", __func__,
> > > > +			req->r_deleg_ino);
> > > > +	}
> > > > +out:
> > > > +	ceph_put_cap_refs(ceph_inode(req->r_parent),
> > > > +			  CEPH_CAP_FILE_EXCL | CEPH_CAP_DIR_CREATE);
> > > > +}
> > > > +
> > > > +static int ceph_finish_async_open(struct inode *dir, struct dentry *dentry,
> > > > +				  struct file *file, umode_t mode,
> > > > +				  struct ceph_mds_request *req,
> > > > +				  struct ceph_acl_sec_ctx *as_ctx)
> > > > +{
> > > > +	int ret;
> > > > +	struct ceph_mds_reply_inode in = { };
> > > > +	struct ceph_mds_reply_info_in iinfo = { .in = &in };
> > > > +	struct ceph_inode_info *ci = ceph_inode(dir);
> > > > +	struct inode *inode;
> > > > +	struct timespec64 now;
> > > > +	struct ceph_vino vino = { .ino = req->r_deleg_ino,
> > > > +				  .snap = CEPH_NOSNAP };
> > > > +
> > > > +	ktime_get_real_ts64(&now);
> > > > +
> > > > +	inode = ceph_get_inode(dentry->d_sb, vino);
> > > > +	if (IS_ERR(inode))
> > > > +		return PTR_ERR(inode);
> > > > +
> > > > +	/* If we can't get a buffer, just carry on */
> > > > +	iinfo.xattr_data = kzalloc(4, GFP_NOFS);
> > > > +	if (iinfo.xattr_data)
> > > > +		iinfo.xattr_len = 4;
> > > 
> > > ??
> > > 
> > > I think we should decode req->r_pagelist into xattrs
> > > 
> > > 
> > 
> > I'm not sure I follow what you're suggesting here. At this point,
> > r_pagelist may be set to as_ctx.pagelist. It would be nice to avoid an
> > allocation here though. I guess I could pass in an on-stack buffer. It
> > is only 4 bytes after all.
> > 
> > > > +
> > > > +	iinfo.inline_version = CEPH_INLINE_NONE;
> > > > +	iinfo.change_attr = 1;
> > > > +	ceph_encode_timespec64(&iinfo.btime, &now);
> > > > +
> > > > +	in.ino = cpu_to_le64(vino.ino);
> > > > +	in.snapid = cpu_to_le64(CEPH_NOSNAP);
> > > > +	in.version = cpu_to_le64(1);	// ???
> > > > +	in.cap.caps = in.cap.wanted = cpu_to_le32(CEPH_CAP_ALL_FILE);
> > > > +	in.cap.cap_id = cpu_to_le64(CEPH_CAP_ID_TBD);
> > > > +	in.cap.realm = cpu_to_le64(ci->i_snap_realm->ino);
> > > > +	in.cap.flags = CEPH_CAP_FLAG_AUTH;
> > > > +	in.ctime = in.mtime = in.atime = iinfo.btime;
> > > > +	in.mode = cpu_to_le32((u32)mode);
> > > > +	in.truncate_seq = cpu_to_le32(1);
> > > > +	in.truncate_size = cpu_to_le64(ci->i_truncate_size);
> > > > +	in.max_size = cpu_to_le64(ci->i_max_size);
> > > > +	in.xattr_version = cpu_to_le64(1);
> > > > +	in.uid = cpu_to_le32(from_kuid(&init_user_ns, current_fsuid()));
> > > > +	in.gid = cpu_to_le32(from_kgid(&init_user_ns, current_fsgid()));
> > > 
> > > > if dir has S_ISGID, new file's gid should be inherited from dir's gid
> > > 
> > > 
> > 
> > Good catch. I'll fix that up for the next iteration.
> > 
> > > > +	in.nlink = cpu_to_le32(1);
> > > > +
> > > > +	ceph_file_layout_to_legacy(&ci->i_layout, &in.layout);
> > > > +
> > > > +	ret = ceph_fill_inode(inode, NULL, &iinfo, NULL, req->r_session,
> > > > +			      req->r_fmode, NULL);
> > > > +	if (ret) {
> > > > +		dout("%s failed to fill inode: %d\n", __func__, ret);
> > > > +		if (inode->i_state & I_NEW)
> > > > +			discard_new_inode(inode);
> > > > +	} else {
> > > > +		struct dentry *dn;
> > > > +
> > > > +		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
> > > > +			vino.ino, dir->i_ino, dentry->d_name.name);
> > > > +		ceph_dir_clear_ordered(dir);
> > > > +		ceph_init_inode_acls(inode, as_ctx);
> > > > +		if (inode->i_state & I_NEW)
> > > > +			unlock_new_inode(inode);
> > > > +		if (d_in_lookup(dentry) || d_really_is_negative(dentry)) {
> > > > +			if (!d_unhashed(dentry))
> > > > +				d_drop(dentry);
> > > > +			dn = d_splice_alias(inode, dentry);
> > > > +			WARN_ON_ONCE(dn && dn != dentry);
> > > > +		}
> > > > +		file->f_mode |= FMODE_CREATED;
> > > > +		ret = finish_open(file, dentry, ceph_open);
> > > > +	}
> > > > +	return ret;
> > > > +}
> > > > +
> > > >    /*
> > > >     * Do a lookup + open with a single request.  If we get a non-existent
> > > >     * file or symlink, return 1 so the VFS can retry.
> > > > @@ -462,6 +597,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    	struct ceph_mds_request *req;
> > > >    	struct dentry *dn;
> > > >    	struct ceph_acl_sec_ctx as_ctx = {};
> > > > +	bool try_async = enable_async_dirops;
> > > >    	int mask;
> > > >    	int err;
> > > >    
> > > > @@ -486,6 +622,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    		return -ENOENT;
> > > >    	}
> > > >    
> > > > +retry:
> > > >    	/* do the open */
> > > >    	req = prepare_open_request(dir->i_sb, flags, mode);
> > > >    	if (IS_ERR(req)) {
> > > > @@ -494,6 +631,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    	}
> > > >    	req->r_dentry = dget(dentry);
> > > >    	req->r_num_caps = 2;
> > > > +	mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > +	if (ceph_security_xattr_wanted(dir))
> > > > +		mask |= CEPH_CAP_XATTR_SHARED;
> > > > +	req->r_args.open.mask = cpu_to_le32(mask);
> > > > +	req->r_parent = dir;
> > > > +
> > > >    	if (flags & O_CREAT) {
> > > >    		req->r_dentry_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_AUTH_EXCL;
> > > >    		req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
> > > > @@ -501,21 +644,37 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    			req->r_pagelist = as_ctx.pagelist;
> > > >    			as_ctx.pagelist = NULL;
> > > >    		}
> > > > +		if (try_async && get_caps_for_async_create(dir, dentry)) {
> > > > +			set_bit(CEPH_MDS_R_DELEG_INO, &req->r_req_flags);
> > > > +			req->r_callback = ceph_async_create_cb;
> > > > +			err = ceph_mdsc_submit_request(mdsc, dir, req);
> > > > +			switch (err) {
> > > > +			case 0:
> > > > +				/* set up inode, dentry and return */
> > > > +				err = ceph_finish_async_open(dir, dentry, file,
> > > > +							mode, req, &as_ctx);
> > > > +				goto out_req;
> > > > +			case -ECHILD:
> > > > +				/* do a sync create */
> > > > +				try_async = false;
> > > > +				as_ctx.pagelist = req->r_pagelist;
> > > > +				req->r_pagelist = NULL;
> > > > +				ceph_mdsc_put_request(req);
> > > > +				goto retry;
> > > > +			default:
> > > > +				/* Hard error, give up */
> > > > +				goto out_req;
> > > > +			}
> > > > +		}
> > > >    	}
> > > >    
> > > > -       mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> > > > -       if (ceph_security_xattr_wanted(dir))
> > > > -               mask |= CEPH_CAP_XATTR_SHARED;
> > > > -       req->r_args.open.mask = cpu_to_le32(mask);
> > > > -
> > > > -	req->r_parent = dir;
> > > >    	set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
> > > >    	err = ceph_mdsc_do_request(mdsc,
> > > >    				   (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
> > > >    				   req);
> > > >    	err = ceph_handle_snapdir(req, dentry, err);
> > > >    	if (err)
> > > > -		goto out_req;
> > > > +		goto out_fmode;
> > > >    
> > > >    	if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
> > > >    		err = ceph_handle_notrace_create(dir, dentry);
> > > > @@ -529,7 +688,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    		dn = NULL;
> > > >    	}
> > > >    	if (err)
> > > > -		goto out_req;
> > > > +		goto out_fmode;
> > > >    	if (dn || d_really_is_negative(dentry) || d_is_symlink(dentry)) {
> > > >    		/* make vfs retry on splice, ENOENT, or symlink */
> > > >    		dout("atomic_open finish_no_open on dn %p\n", dn);
> > > > @@ -545,9 +704,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
> > > >    		}
> > > >    		err = finish_open(file, dentry, ceph_open);
> > > >    	}
> > > > -out_req:
> > > > +out_fmode:
> > > >    	if (!req->r_err && req->r_target_inode)
> > > >    		ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode);
> > > > +out_req:
> > > >    	ceph_mdsc_put_request(req);
> > > >    out_ctx:
> > > >    	ceph_release_acl_sec_ctx(&as_ctx);
> > > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > > index 9e7492b21b50..c76d6e7f8136 100644
> > > > --- a/fs/ceph/mds_client.c
> > > > +++ b/fs/ceph/mds_client.c
> > > > @@ -2620,14 +2620,16 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
> > > >    		flags |= CEPH_MDS_FLAG_REPLAY;
> > > >    	if (req->r_parent)
> > > >    		flags |= CEPH_MDS_FLAG_WANT_DENTRY;
> > > > -	rhead->flags = cpu_to_le32(flags);
> > > > -	rhead->num_fwd = req->r_num_fwd;
> > > > -	rhead->num_retry = req->r_attempts - 1;
> > > > -	if (test_bit(CEPH_MDS_R_DELEG_INO, &req->r_req_flags))
> > > > +	if (test_bit(CEPH_MDS_R_DELEG_INO, &req->r_req_flags)) {
> > > >    		rhead->ino = cpu_to_le64(req->r_deleg_ino);
> > > > -	else
> > > > +		flags |= CEPH_MDS_FLAG_ASYNC;
> > > > +	} else {
> > > >    		rhead->ino = 0;
> > > > +	}
> > > >    
> > > > +	rhead->flags = cpu_to_le32(flags);
> > > > +	rhead->num_fwd = req->r_num_fwd;
> > > > +	rhead->num_retry = req->r_attempts - 1;
> > > >    	dout(" r_parent = %p\n", req->r_parent);
> > > >    	return 0;
> > > >    }
> > > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > > index e0b36be7c44f..49e6cd5a07a2 100644
> > > > --- a/fs/ceph/mds_client.h
> > > > +++ b/fs/ceph/mds_client.h
> > > > @@ -39,8 +39,7 @@ enum ceph_feature_type {
> > > >    	CEPHFS_FEATURE_REPLY_ENCODING,		\
> > > >    	CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
> > > >    	CEPHFS_FEATURE_MULTI_RECONNECT,		\
> > > > -						\
> > > > -	CEPHFS_FEATURE_MAX,			\
> > > > +	CEPHFS_FEATURE_OCTOPUS,			\
> > > >    }
> > > >    #define CEPHFS_FEATURES_CLIENT_REQUIRED {}
> > > >    
> > > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > > index ec4d66d7c261..33e03fbba888 100644
> > > > --- a/fs/ceph/super.h
> > > > +++ b/fs/ceph/super.h
> > > > @@ -136,6 +136,8 @@ struct ceph_fs_client {
> > > >    #endif
> > > >    };
> > > >    
> > > > +/* Special placeholder value for a cap_id during an asynchronous create. */
> > > > +#define        CEPH_CAP_ID_TBD         -1ULL
> > > >    
> > > >    /*
> > > >     * File i/o capability.  This tracks shared state with the metadata
> > > > diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
> > > > index a099f60feb7b..b127563e21a1 100644
> > > > --- a/include/linux/ceph/ceph_fs.h
> > > > +++ b/include/linux/ceph/ceph_fs.h
> > > > @@ -444,8 +444,9 @@ union ceph_mds_request_args {
> > > >    	} __attribute__ ((packed)) lookupino;
> > > >    } __attribute__ ((packed));
> > > >    
> > > > -#define CEPH_MDS_FLAG_REPLAY        1  /* this is a replayed op */
> > > > -#define CEPH_MDS_FLAG_WANT_DENTRY   2  /* want dentry in reply */
> > > > +#define CEPH_MDS_FLAG_REPLAY		1  /* this is a replayed op */
> > > > +#define CEPH_MDS_FLAG_WANT_DENTRY	2  /* want dentry in reply */
> > > > +#define CEPH_MDS_FLAG_ASYNC		4  /* request is asynchronous */
> > > >    
> > > >    struct ceph_mds_request_head {
> > > >    	__le64 oldest_client_tid;
> > > > @@ -658,6 +659,9 @@ int ceph_flags_to_mode(int flags);
> > > >    #define CEPH_CAP_ANY      (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
> > > >    			   CEPH_CAP_ANY_FILE_WR | CEPH_CAP_FILE_LAZYIO | \
> > > >    			   CEPH_CAP_PIN)
> > > > +#define CEPH_CAP_ALL_FILE (CEPH_CAP_PIN | CEPH_CAP_ANY_SHARED | \
> > > > +			   CEPH_CAP_AUTH_EXCL | CEPH_CAP_XATTR_EXCL | \
> > > > +			   CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)
> > > >    
> > > >    #define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
> > > >    			CEPH_LOCK_IXATTR)
> > > > 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>