Re: [PATCH 1/2] ceph: issue getattr/lookup reqs to MDSes in an aggregative pattern

Jeff Layton <jlayton@xxxxxxxxxx> · Tue, 26 Feb 2019 18:54:23 -0500

On Mon, 2019-02-25 at 20:29 +0800, xxhdx1985126@xxxxxxxxx wrote:
> From: Xuehan Xu <xuxuehan@xxxxxx>
> 
> Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup
> op, issue a new one if there is no inflight req that requires that same caps
> as the current getattr/lookup op.
> 
> Signed-off-by: Xuehan Xu <xuxuehan@xxxxxx>
> ---
>  fs/ceph/caps.c       |   1 +
>  fs/ceph/dir.c        |  92 ++++++++++++++++++++---------
>  fs/ceph/inode.c      |  55 +++++++++++++----
>  fs/ceph/mds_client.c |  30 +++++++++-
>  fs/ceph/mds_client.h |  15 ++++-
>  fs/ceph/super.c      | 136 +++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/super.h      |  14 +++++
>  7 files changed, 304 insertions(+), 39 deletions(-)
> 

The description on this patch could be fleshed out a bit.

How much does this help performance and on what workloads? This is a lot
of code to add without much justification.

Looking over the patch, the basic idea is to implement a cache (in
rbtrees) of getattrs and lookups in flight and just wait on those if
there already is one that matches what we need. I agree with Zheng that
this could end up violating the strict cache coherency guarantees that
ceph tries to provide.

You're relying on an assumption that if you issue a getattr/lookup while
another one is already in flight, then it's legitimate to use the result
of the initial one for the one issued later. That's not necessarily the
case. I suspect this would be possible, even from a single client:

- client1/task1 issues stat(), server performs getattr
but reply gets delayed for a bit

- client2 does write() + fsync()

- client1/task2 issues stat(), finds the entry in cache from task1 and
ends up waiting for first getattr to complete

- getattr reply comes in, and both stat calls get back the same
attributes, showing pre-write size/mtime

In this case, the synchronous write by client2 was provably done before
the second getattr, but the second stat() call wouldn't see the new
size.

The read handling patch may suffer from similar issues, though I haven't
gone over that one in as much detail.

> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index bba28a5034ba..5a177a3c283f 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
>  		goto out;
>  
>  	ci->i_flushing_caps &= ~cleaned;
> +	ci->last_flush_ack_tid = flush_tid;
>  
>  	spin_lock(&mdsc->cap_dirty_lock);
>  
> diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
> index 82928cea0209..6f17c0319158 100644
> --- a/fs/ceph/dir.c
> +++ b/fs/ceph/dir.c
> @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>  				}
>  			}
>  		}
> +		dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies);
>  	}
>  	spin_unlock(&dentry->d_lock);
>  
> @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
>  					 CEPH_MDS_LEASE_RENEW, seq);
>  		ceph_put_mds_session(session);
>  	}
> -	dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
> +	dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid);
>  	return valid;
>  }
>  
> @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
>  	if (!valid) {
>  		struct ceph_mds_client *mdsc =
>  			ceph_sb_to_client(dir->i_sb)->mdsc;
> -		struct ceph_mds_request *req;
> +		struct ceph_mds_request *req = NULL;
> +		struct ceph_inode_info* cdir = ceph_inode(dir);
>  		int op, err;
>  		u32 mask;
>  
>  		if (flags & LOOKUP_RCU)
>  			return -ECHILD;
> +		mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> +		if (ceph_security_xattr_wanted(dir))
> +			mask |= CEPH_CAP_XATTR_SHARED;
>  
>  		op = ceph_snap(dir) == CEPH_SNAPDIR ?
>  			CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
> -		req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> -		if (!IS_ERR(req)) {
> -			req->r_dentry = dget(dentry);
> -			req->r_num_caps = 2;
> -			req->r_parent = dir;
> +		if (op == CEPH_MDS_OP_LOOKUP) {
> +			mutex_lock(&cdir->lookups_inflight_lock);
> +			dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n", 
> +					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask);
> +			req = __search_inode_getattr_or_lookup(cdir, mask, true);
> +		}
> +		if (req && op == CEPH_MDS_OP_LOOKUP) {
> +			dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
> +					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> +			ceph_mdsc_get_request(req);
> +			mutex_unlock(&cdir->lookups_inflight_lock);
> +			err = ceph_mdsc_wait_for_request(req);
> +			dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n",
> +					dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> +		} else {
>  
> -			mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
> -			if (ceph_security_xattr_wanted(dir))
> -				mask |= CEPH_CAP_XATTR_SHARED;
> -			req->r_args.getattr.mask = cpu_to_le32(mask);
> +			req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
> +			if (op == CEPH_MDS_OP_LOOKUP) {
> +				if (!IS_ERR(req)) {
> +					unsigned long res = 0;
> +					req->r_dentry = dget(dentry);
> +					req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid;
> +					req->r_num_caps = 2;
> +					req->r_parent = dir;
> +					req->r_args.getattr.mask = cpu_to_le32(mask);
> +					res = __register_inode_getattr_or_lookup(cdir, req, true);
> +					if (IS_ERR_VALUE(res)) {
> +						mutex_unlock(&cdir->lookups_inflight_lock);
> +						goto out;
> +					}
> +					dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", 
> +							dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started);
> +				}
> +				mutex_unlock(&cdir->lookups_inflight_lock);
> +			}
> +			if (IS_ERR(req))
> +				goto out;
>  
>  			err = ceph_mdsc_do_request(mdsc, NULL, req);
> -			switch (err) {
> -			case 0:
> -				if (d_really_is_positive(dentry) &&
> -				    d_inode(dentry) == req->r_target_inode)
> -					valid = 1;
> -				break;
> -			case -ENOENT:
> -				if (d_really_is_negative(dentry))
> -					valid = 1;
> -				/* Fallthrough */
> -			default:
> -				break;
> +			if (op == CEPH_MDS_OP_LOOKUP) {
> +				mutex_lock(&cdir->lookups_inflight_lock);
> +				__unregister_inode_getattr_or_lookup(cdir, req, true);
> +				dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n", 
> +						dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err);
> +				mutex_unlock(&cdir->lookups_inflight_lock);
>  			}
> -			ceph_mdsc_put_request(req);
> -			dout("d_revalidate %p lookup result=%d\n",
> -			     dentry, err);
>  		}
> +		switch (err) {
> +		case 0:
> +			if (d_really_is_positive(dentry) &&
> +			    d_inode(dentry) == req->r_target_inode)
> +				valid = 1;
> +			break;
> +		case -ENOENT:
> +			if (d_really_is_negative(dentry))
> +				valid = 1;
> +			/* Fallthrough */
> +		default:
> +			break;
> +		}
> +		ceph_mdsc_put_request(req);
> +		dout("d_revalidate %p lookup result=%d\n",
> +   				dentry, err);
>  	}
>  
> +out:
>  	dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
>  	if (valid) {
>  		ceph_dentry_lru_touch(dentry);

d_revalidate is changed to use the cache, but not lookup. Why?

> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index 9d1f34d46627..8705f0645a24 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	dout("alloc_inode %p\n", &ci->vfs_inode);
>  
>  	spin_lock_init(&ci->i_ceph_lock);
> +	mutex_init(&ci->getattrs_inflight_lock);
> +	mutex_init(&ci->lookups_inflight_lock);
>  
>  	ci->i_version = 0;
>  	ci->i_inline_version = 0;
> @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
>  	ci->i_xattrs.index_version = 0;
>  
>  	ci->i_caps = RB_ROOT;
> +	ci->getattrs_inflight = RB_ROOT;
> +	ci->lookups_inflight = RB_ROOT;
>  	ci->i_auth_cap = NULL;
>  	ci->i_dirty_caps = 0;
>  	ci->i_flushing_caps = 0;
> +	ci->last_flush_ack_tid = 0;
>  	INIT_LIST_HEAD(&ci->i_dirty_item);
>  	INIT_LIST_HEAD(&ci->i_flushing_item);
>  	ci->i_prealloc_cap_flush = NULL;
> @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry,
>  	 * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
>  	 * we expect a negative dentry.
>  	 */
> +	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>  	if (!tgt_vino && d_really_is_positive(dentry))
>  		return;
> -
> +	dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode);
>  	if (tgt_vino && (d_really_is_negative(dentry) ||
>  			!ceph_ino_compare(d_inode(dentry), tgt_vino)))
>  		return;
> @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>  	struct ceph_mds_request *req;
>  	int mode;
>  	int err;
> +	struct ceph_inode_info* cinode = ceph_inode(inode);
>  
>  	if (ceph_snap(inode) == CEPH_SNAPDIR) {
>  		dout("do_getattr inode %p SNAPDIR\n", inode);
> @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
>  	if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
>  		return 0;
>  
> -	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> -	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> -	if (IS_ERR(req))
> -		return PTR_ERR(req);
> -	req->r_inode = inode;
> -	ihold(inode);
> -	req->r_num_caps = 1;
> -	req->r_args.getattr.mask = cpu_to_le32(mask);
> -	req->r_locked_page = locked_page;
> -	err = ceph_mdsc_do_request(mdsc, NULL, req);
> +	mutex_lock(&cinode->getattrs_inflight_lock);
> +	dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask);
> +	req = __search_inode_getattr_or_lookup(cinode, mask, false);
> +	if (req) {
> +		dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> +		ceph_mdsc_get_request(req);
> +		mutex_unlock(&cinode->getattrs_inflight_lock);
> +		err = ceph_mdsc_wait_for_request(req);
> +		dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> +	} else {
> +		mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
> +		req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
> +		if (!IS_ERR(req)) {
> +			unsigned long res = 0;
> +			req->r_inode = inode;
> +			req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid;
> +			ihold(inode);
> +			req->r_num_caps = 1;
> +			req->r_args.getattr.mask = cpu_to_le32(mask);
> +			req->r_locked_page = locked_page;
> +			res = __register_inode_getattr_or_lookup(cinode, req, false);
> +			if (IS_ERR_VALUE(res)) {
> +				mutex_unlock(&cinode->getattrs_inflight_lock);
> +				return res;
> +			}
> +			dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started);
> +		}
> +		mutex_unlock(&cinode->getattrs_inflight_lock);
> +		if (IS_ERR(req))
> +			return PTR_ERR(req);
> +		err = ceph_mdsc_do_request(mdsc, NULL, req);
> +		mutex_lock(&cinode->getattrs_inflight_lock);
> +		__unregister_inode_getattr_or_lookup(cinode, req, false);
> +		dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err);
> +		mutex_unlock(&cinode->getattrs_inflight_lock);
> +	}
>  	if (locked_page && err == 0) {
>  		u64 inline_version = req->r_reply_info.targeti.inline_version;
>  		if (inline_version == 0) {
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 163fc74bf221..5c6e00f2fd0a 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
>  	return 0;
>  }
>  
> +void ceph_mds_requests_tree_release(struct kref* kref)
> +{
> +	struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref);
> +
> +	BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests));
> +	kfree(mrtree);
> +}
> +
>  /*
>   * Create an mds request.
>   */
> @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
>  	req->r_fmode = -1;
>  	kref_init(&req->r_kref);
>  	RB_CLEAR_NODE(&req->r_node);
> +	RB_CLEAR_NODE(&req->getattr_lookup_node);
>  	INIT_LIST_HEAD(&req->r_wait);
> +	init_completion(&req->batch_op_completion);
>  	init_completion(&req->r_completion);
>  	init_completion(&req->r_safe_completion);
>  	INIT_LIST_HEAD(&req->r_unsafe_item);
> @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>  	mutex_unlock(&mdsc->mutex);
>  }
>  
> +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req)
> +{
> +	int err = 0;
> +	long timeleft = wait_for_completion_killable_timeout(
> +			&req->batch_op_completion,
> +			ceph_timeout_jiffies(req->r_timeout));
> +	if (timeleft > 0)
> +		err = 0;
> +	else if (!timeleft)
> +		err = -EIO;  /* timed out */
> +	else
> +		err = timeleft;  /* killed */
> +	if (!err)
> +		return err;
> +	return req->batch_op_err;
> +}
> +
>  /*
>   * Synchrously perform an mds request.  Take care of all of the
>   * session setup, forwarding, retry details.
> @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>  	} else {
>  		err = req->r_err;
>  	}
> -
> +	req->batch_op_err = err;
> +	complete_all(&req->batch_op_completion);
>  out:
>  	mutex_unlock(&mdsc->mutex);
>  	dout("do_request %p done, result %d\n", req, err);
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 729da155ebf0..41727677643e 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
>  typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc,
>  						 struct ceph_mds_request *req);
>  
> +struct mds_request_tree {
> +	int mask;
> +	struct kref ref;
> +	struct rb_root mds_requests;
> +	struct rb_node parent_tree_node;
> +};
> +
> +extern void ceph_mds_requests_tree_release(struct kref* ref);
> +
>  /*
>   * an in-flight mds request
>   */
>  struct ceph_mds_request {
>  	u64 r_tid;                   /* transaction id */
> +	u64 last_caps_flush_ack_tid;
>  	struct rb_node r_node;
> +	struct rb_node getattr_lookup_node;
>  	struct ceph_mds_client *r_mdsc;
>  
>  	int r_op;                    /* mds op code */
> @@ -264,7 +275,7 @@ struct ceph_mds_request {
>  	struct ceph_msg  *r_reply;
>  	struct ceph_mds_reply_info_parsed r_reply_info;
>  	struct page *r_locked_page;
> -	int r_err;
> +	int r_err, batch_op_err;
>  
>  	unsigned long r_timeout;  /* optional.  jiffies, 0 is "wait forever" */
>  	unsigned long r_started;  /* start time to measure timeout against */
> @@ -287,6 +298,7 @@ struct ceph_mds_request {
>  
>  	struct kref       r_kref;
>  	struct list_head  r_wait;
> +	struct completion batch_op_completion;
>  	struct completion r_completion;
>  	struct completion r_safe_completion;
>  	ceph_mds_request_callback_t r_callback;
> @@ -425,6 +437,7 @@ extern struct ceph_mds_request *
>  ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
>  extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
>  				     struct ceph_mds_request *req);
> +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req);
>  extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
>  				struct inode *dir,
>  				struct ceph_mds_request *req);
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index da2cd8e89062..857fbbc0b768 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void)
>  	destroy_caches();
>  }
>  
> +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +					  struct ceph_mds_request* req,
> +					  bool is_lookup)
> +{
> +	struct rb_node *node = NULL;
> +	struct mds_request_tree* tmp = NULL;
> +	if (!is_lookup)
> +		node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> +	else
> +		node = inode->lookups_inflight.rb_node;
> +	
> +	while (node)
> +	{
> +		tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> +		
> +		if (tmp->mask > req->r_args.getattr.mask)
> +			node = node->rb_left;
> +		else if (tmp->mask < req->r_args.getattr.mask)
> +			node = node->rb_right;
> +		else
> +			break;
> +	}
> +
> +	if (!node)
> +		BUG();
> +
> +	rb_erase(&req->getattr_lookup_node, &tmp->mds_requests);
> +	
> +	if (RB_EMPTY_ROOT(&tmp->mds_requests)) {
> +		if (!is_lookup)
> +			rb_erase(node, &inode->getattrs_inflight);
> +		else
> +			rb_erase(node, &inode->lookups_inflight);
> +		kref_put(&tmp->ref, ceph_mds_requests_tree_release);
> +	}
> +}
> +
> +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +					struct ceph_mds_request* req,
> +					bool is_lookup)
> +{
> +	struct rb_node **p = NULL, *parent = NULL;
> +	struct mds_request_tree *tmp = NULL;
> +	struct ceph_mds_request* tmp_req = NULL;
> +	struct mds_request_tree* mrtree = NULL;
> +
> +	if (!is_lookup)
> +		p = &ci->getattrs_inflight.rb_node;
> +	else
> +		p = &ci->lookups_inflight.rb_node;
> +	
> +	while (*p) {
> +		parent = *p;
> +		tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node);
> +		if (req->r_args.getattr.mask < tmp->mask)
> +			p = &(*p)->rb_left;
> +		else if (req->r_args.getattr.mask > tmp->mask)
> +			p = &(*p)->rb_right;
> +		else
> +			break;
> +	}
> +	
> +	if (!(*p)) {
> +		mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS);
> +
> +		if (!mrtree)
> +			return -ENOMEM;
> +
> +		mrtree->mds_requests = RB_ROOT;
> +		mrtree->mask = req->r_args.getattr.mask;
> +		kref_init(&mrtree->ref);
> +		rb_link_node(&mrtree->parent_tree_node, parent, p);
> +		if (!is_lookup) {
> +			rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight);
> +		} else {
> +			rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight);
> +		}
> +	} else {
> +		mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node);
> +	}
> +
> +	p = &mrtree->mds_requests.rb_node;
> +	parent = NULL;
> +	while (*p) {
> +		parent = *p;
> +		tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node);
> +		if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid)
> +			p = &(*p)->rb_left;
> +		else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid)
> +			p = &(*p)->rb_right;
> +		else
> +			BUG();
> +	}
> +
> +	rb_link_node(&req->getattr_lookup_node, parent, p);
> +	rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests);
> +	return 0;
> +}
> +
> +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +				      int mask,
> +				      bool is_lookup)
> +{
> +	struct rb_node *node = NULL;
> +	if (!is_lookup)
> +		node = inode->getattrs_inflight.rb_node;  /* top of the tree */
> +	else
> +		node = inode->lookups_inflight.rb_node;
> +	
> +	while (node)
> +	{
> +		struct mds_request_tree* tmp = NULL;
> +		tmp = rb_entry(node, struct mds_request_tree, parent_tree_node);
> +		
> +		if (tmp->mask > mask) {
> +			node = node->rb_left;
> +		} else if (tmp->mask < mask) {
> +			node = node->rb_right;
> +		} else {
> +			struct rb_node* inner_node = tmp->mds_requests.rb_node;
> +			while (inner_node) {
> +				struct ceph_mds_request* req = NULL;
> +
> +				req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node);
> +				if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) {
> +					inner_node = inner_node->rb_left;
> +				} else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) {
> +					inner_node = inner_node->rb_right;
> +				} else
> +					return req; /* Found it */
> +			}
> +			return NULL;
> +		}
> +	}
> +	return NULL;
> +}
>  module_init(init_ceph);
>  module_exit(exit_ceph);
>  
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index dfb64a5211b6..abf761f2a122 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -296,6 +296,8 @@ struct ceph_inode_info {
>  	struct ceph_vino i_vino;   /* ceph ino + snap */
>  
>  	spinlock_t i_ceph_lock;
> +	struct mutex getattrs_inflight_lock, lookups_inflight_lock;
> +	struct rb_root getattrs_inflight, lookups_inflight;

If you do end up pursuing this further, I think it might be better to
use a spinlock to protect the trees.

Take the lock, search the tree and if there isn't one, drop the lock,
allocate, take it and search again, and insert the one you allocated if
no conflicting structure was inserted in the meantime.

In addition to cutting down on context switches when these things are
contended, that would also make it easier to hide the locking inside the
cache search/register/unregister routines.

>  
>  	u64 i_version;
>  	u64 i_inline_version;
> @@ -329,6 +331,7 @@ struct ceph_inode_info {
>  	struct rb_root i_caps;           /* cap list */
>  	struct ceph_cap *i_auth_cap;     /* authoritative cap, if any */
>  	unsigned i_dirty_caps, i_flushing_caps;     /* mask of dirtied fields */
> +	u64 last_flush_ack_tid;
>  	struct list_head i_dirty_item, i_flushing_item;
>  	/* we need to track cap writeback on a per-cap-bit basis, to allow
>  	 * overlapping, pipelined cap flushes to the mds.  we can probably
> @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued,
>  				u64 time_warp_seq, struct timespec64 *ctime,
>  				struct timespec64 *mtime,
>  				struct timespec64 *atime);
> +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +					       struct ceph_mds_request* req,
> +					       bool is_lookup);
> +
> +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci,
> +						 struct ceph_mds_request* req,
> +						 bool is_lookup);
> +
> +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode,
> +					     int mask,
> +					     bool is_lookup);
>  extern int ceph_fill_trace(struct super_block *sb,
>  			   struct ceph_mds_request *req);
>  extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,

-- 
Jeff Layton <jlayton@xxxxxxxxxx>