Hi, zheng yan. I've just modified the code to solve the issue that you mentioned the other day, please take a look:-) thanks On Mon, 25 Feb 2019 at 20:29, <xxhdx1985126@xxxxxxxxx> wrote: > > From: Xuehan Xu <xuxuehan@xxxxxx> > > Instead of issue a new getattr/lookup req to MDSes for each getattr/lookup > op, issue a new one if there is no inflight req that requires that same caps > as the current getattr/lookup op. > > Signed-off-by: Xuehan Xu <xuxuehan@xxxxxx> > --- > fs/ceph/caps.c | 1 + > fs/ceph/dir.c | 92 ++++++++++++++++++++--------- > fs/ceph/inode.c | 55 +++++++++++++---- > fs/ceph/mds_client.c | 30 +++++++++- > fs/ceph/mds_client.h | 15 ++++- > fs/ceph/super.c | 136 +++++++++++++++++++++++++++++++++++++++++++ > fs/ceph/super.h | 14 +++++ > 7 files changed, 304 insertions(+), 39 deletions(-) > > diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c > index bba28a5034ba..5a177a3c283f 100644 > --- a/fs/ceph/caps.c > +++ b/fs/ceph/caps.c > @@ -3360,6 +3360,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, > goto out; > > ci->i_flushing_caps &= ~cleaned; > + ci->last_flush_ack_tid = flush_tid; > > spin_lock(&mdsc->cap_dirty_lock); > > diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c > index 82928cea0209..6f17c0319158 100644 > --- a/fs/ceph/dir.c > +++ b/fs/ceph/dir.c > @@ -1180,6 +1180,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, > } > } > } > + dout("dentry_lease_is_valid ttl = %ld, ceph_dentry.time = %ld, lease_renew_after = %ld, lease_renew_from = %ld, jiffies = %ld\n", ttl, di->time, di->lease_renew_after, di->lease_renew_from, jiffies); > } > spin_unlock(&dentry->d_lock); > > @@ -1188,7 +1189,7 @@ static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags, > CEPH_MDS_LEASE_RENEW, seq); > ceph_put_mds_session(session); > } > - dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid); > + dout("dentry_lease_is_valid - di %p, dentry %p = %d\n", di, dentry, valid); > return valid; > } > > @@ -1256,46 +1257,85 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) > if (!valid) { > struct ceph_mds_client *mdsc = > ceph_sb_to_client(dir->i_sb)->mdsc; > - struct ceph_mds_request *req; > + struct ceph_mds_request *req = NULL; > + struct ceph_inode_info* cdir = ceph_inode(dir); > int op, err; > u32 mask; > > if (flags & LOOKUP_RCU) > return -ECHILD; > + mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > + if (ceph_security_xattr_wanted(dir)) > + mask |= CEPH_CAP_XATTR_SHARED; > > op = ceph_snap(dir) == CEPH_SNAPDIR ? > CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; > - req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); > - if (!IS_ERR(req)) { > - req->r_dentry = dget(dentry); > - req->r_num_caps = 2; > - req->r_parent = dir; > + if (op == CEPH_MDS_OP_LOOKUP) { > + mutex_lock(&cdir->lookups_inflight_lock); > + dout("d_revalidate searching inode lookups inflight, %p, '%pd', inode %p offset %lld, mask: %d\n", > + dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask); > + req = __search_inode_getattr_or_lookup(cdir, mask, true); > + } > + if (req && op == CEPH_MDS_OP_LOOKUP) { > + dout("d_revalidate found previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", > + dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started); > + ceph_mdsc_get_request(req); > + mutex_unlock(&cdir->lookups_inflight_lock); > + err = ceph_mdsc_wait_for_request(req); > + dout("d_revalidate waited previous lookup inflight, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n", > + dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err); > + } else { > > - mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; > - if (ceph_security_xattr_wanted(dir)) > - mask |= CEPH_CAP_XATTR_SHARED; > - req->r_args.getattr.mask = cpu_to_le32(mask); > + req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); > + if (op == CEPH_MDS_OP_LOOKUP) { > + if (!IS_ERR(req)) { > + unsigned long res = 0; > + req->r_dentry = dget(dentry); > + req->last_caps_flush_ack_tid = cdir->last_flush_ack_tid; > + req->r_num_caps = 2; > + req->r_parent = dir; > + req->r_args.getattr.mask = cpu_to_le32(mask); > + res = __register_inode_getattr_or_lookup(cdir, req, true); > + if (IS_ERR_VALUE(res)) { > + mutex_unlock(&cdir->lookups_inflight_lock); > + goto out; > + } > + dout("d_revalidate no previous lookup inflight, just registered a new one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld\n", > + dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started); > + } > + mutex_unlock(&cdir->lookups_inflight_lock); > + } > + if (IS_ERR(req)) > + goto out; > > err = ceph_mdsc_do_request(mdsc, NULL, req); > - switch (err) { > - case 0: > - if (d_really_is_positive(dentry) && > - d_inode(dentry) == req->r_target_inode) > - valid = 1; > - break; > - case -ENOENT: > - if (d_really_is_negative(dentry)) > - valid = 1; > - /* Fallthrough */ > - default: > - break; > + if (op == CEPH_MDS_OP_LOOKUP) { > + mutex_lock(&cdir->lookups_inflight_lock); > + __unregister_inode_getattr_or_lookup(cdir, req, true); > + dout("d_revalidate just unregistered one, %p, '%pd', inode %p offset %lld, mask: %d, req jiffies: %ld, err: %d\n", > + dentry, dentry, d_inode(dentry), ceph_dentry(dentry)->offset, mask, req->r_started, err); > + mutex_unlock(&cdir->lookups_inflight_lock); > } > - ceph_mdsc_put_request(req); > - dout("d_revalidate %p lookup result=%d\n", > - dentry, err); > } > + switch (err) { > + case 0: > + if (d_really_is_positive(dentry) && > + d_inode(dentry) == req->r_target_inode) > + valid = 1; > + break; > + case -ENOENT: > + if (d_really_is_negative(dentry)) > + valid = 1; > + /* Fallthrough */ > + default: > + break; > + } > + ceph_mdsc_put_request(req); > + dout("d_revalidate %p lookup result=%d\n", > + dentry, err); > } > > +out: > dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); > if (valid) { > ceph_dentry_lru_touch(dentry); > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c > index 9d1f34d46627..8705f0645a24 100644 > --- a/fs/ceph/inode.c > +++ b/fs/ceph/inode.c > @@ -430,6 +430,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) > dout("alloc_inode %p\n", &ci->vfs_inode); > > spin_lock_init(&ci->i_ceph_lock); > + mutex_init(&ci->getattrs_inflight_lock); > + mutex_init(&ci->lookups_inflight_lock); > > ci->i_version = 0; > ci->i_inline_version = 0; > @@ -461,9 +463,12 @@ struct inode *ceph_alloc_inode(struct super_block *sb) > ci->i_xattrs.index_version = 0; > > ci->i_caps = RB_ROOT; > + ci->getattrs_inflight = RB_ROOT; > + ci->lookups_inflight = RB_ROOT; > ci->i_auth_cap = NULL; > ci->i_dirty_caps = 0; > ci->i_flushing_caps = 0; > + ci->last_flush_ack_tid = 0; > INIT_LIST_HEAD(&ci->i_dirty_item); > INIT_LIST_HEAD(&ci->i_flushing_item); > ci->i_prealloc_cap_flush = NULL; > @@ -1044,9 +1049,10 @@ static void update_dentry_lease(struct dentry *dentry, > * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that > * we expect a negative dentry. > */ > + dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode); > if (!tgt_vino && d_really_is_positive(dentry)) > return; > - > + dout("update_dentry_lease, d_inode: %p\n", dentry->d_inode); > if (tgt_vino && (d_really_is_negative(dentry) || > !ceph_ino_compare(d_inode(dentry), tgt_vino))) > return; > @@ -2188,6 +2194,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, > struct ceph_mds_request *req; > int mode; > int err; > + struct ceph_inode_info* cinode = ceph_inode(inode); > > if (ceph_snap(inode) == CEPH_SNAPDIR) { > dout("do_getattr inode %p SNAPDIR\n", inode); > @@ -2199,16 +2206,42 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page, > if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) > return 0; > > - mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; > - req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); > - if (IS_ERR(req)) > - return PTR_ERR(req); > - req->r_inode = inode; > - ihold(inode); > - req->r_num_caps = 1; > - req->r_args.getattr.mask = cpu_to_le32(mask); > - req->r_locked_page = locked_page; > - err = ceph_mdsc_do_request(mdsc, NULL, req); > + mutex_lock(&cinode->getattrs_inflight_lock); > + dout("__ceph_do_getattr searching inode getattrs inflight, inode %p, mask: %d\n", inode, mask); > + req = __search_inode_getattr_or_lookup(cinode, mask, false); > + if (req) { > + dout("__ceph_do_getattr found previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started); > + ceph_mdsc_get_request(req); > + mutex_unlock(&cinode->getattrs_inflight_lock); > + err = ceph_mdsc_wait_for_request(req); > + dout("__ceph_do_getattr waited previous inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err); > + } else { > + mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS; > + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode); > + if (!IS_ERR(req)) { > + unsigned long res = 0; > + req->r_inode = inode; > + req->last_caps_flush_ack_tid = cinode->last_flush_ack_tid; > + ihold(inode); > + req->r_num_caps = 1; > + req->r_args.getattr.mask = cpu_to_le32(mask); > + req->r_locked_page = locked_page; > + res = __register_inode_getattr_or_lookup(cinode, req, false); > + if (IS_ERR_VALUE(res)) { > + mutex_unlock(&cinode->getattrs_inflight_lock); > + return res; > + } > + dout("__ceph_do_getattr no previous getattr inflight, inode %p, mask: %d, req jiffies: %ld\n", inode, mask, req->r_started); > + } > + mutex_unlock(&cinode->getattrs_inflight_lock); > + if (IS_ERR(req)) > + return PTR_ERR(req); > + err = ceph_mdsc_do_request(mdsc, NULL, req); > + mutex_lock(&cinode->getattrs_inflight_lock); > + __unregister_inode_getattr_or_lookup(cinode, req, false); > + dout("__ceph_do_getattr just unregistered inode getattr inflight, inode %p, mask: %d, req jiffies: %ld, err: %d\n", inode, mask, req->r_started, err); > + mutex_unlock(&cinode->getattrs_inflight_lock); > + } > if (locked_page && err == 0) { > u64 inline_version = req->r_reply_info.targeti.inline_version; > if (inline_version == 0) { > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index 163fc74bf221..5c6e00f2fd0a 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -1815,6 +1815,14 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, > return 0; > } > > +void ceph_mds_requests_tree_release(struct kref* kref) > +{ > + struct mds_request_tree* mrtree = container_of(kref, struct mds_request_tree, ref); > + > + BUG_ON(!RB_EMPTY_ROOT(&mrtree->mds_requests)); > + kfree(mrtree); > +} > + > /* > * Create an mds request. > */ > @@ -1836,7 +1844,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) > req->r_fmode = -1; > kref_init(&req->r_kref); > RB_CLEAR_NODE(&req->r_node); > + RB_CLEAR_NODE(&req->getattr_lookup_node); > INIT_LIST_HEAD(&req->r_wait); > + init_completion(&req->batch_op_completion); > init_completion(&req->r_completion); > init_completion(&req->r_safe_completion); > INIT_LIST_HEAD(&req->r_unsafe_item); > @@ -2429,6 +2439,23 @@ void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, > mutex_unlock(&mdsc->mutex); > } > > +int ceph_mdsc_wait_for_request(struct ceph_mds_request* req) > +{ > + int err = 0; > + long timeleft = wait_for_completion_killable_timeout( > + &req->batch_op_completion, > + ceph_timeout_jiffies(req->r_timeout)); > + if (timeleft > 0) > + err = 0; > + else if (!timeleft) > + err = -EIO; /* timed out */ > + else > + err = timeleft; /* killed */ > + if (!err) > + return err; > + return req->batch_op_err; > +} > + > /* > * Synchrously perform an mds request. Take care of all of the > * session setup, forwarding, retry details. > @@ -2501,7 +2528,8 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, > } else { > err = req->r_err; > } > - > + req->batch_op_err = err; > + complete_all(&req->batch_op_completion); > out: > mutex_unlock(&mdsc->mutex); > dout("do_request %p done, result %d\n", req, err); > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index 729da155ebf0..41727677643e 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -207,12 +207,23 @@ typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc, > typedef int (*ceph_mds_request_wait_callback_t) (struct ceph_mds_client *mdsc, > struct ceph_mds_request *req); > > +struct mds_request_tree { > + int mask; > + struct kref ref; > + struct rb_root mds_requests; > + struct rb_node parent_tree_node; > +}; > + > +extern void ceph_mds_requests_tree_release(struct kref* ref); > + > /* > * an in-flight mds request > */ > struct ceph_mds_request { > u64 r_tid; /* transaction id */ > + u64 last_caps_flush_ack_tid; > struct rb_node r_node; > + struct rb_node getattr_lookup_node; > struct ceph_mds_client *r_mdsc; > > int r_op; /* mds op code */ > @@ -264,7 +275,7 @@ struct ceph_mds_request { > struct ceph_msg *r_reply; > struct ceph_mds_reply_info_parsed r_reply_info; > struct page *r_locked_page; > - int r_err; > + int r_err, batch_op_err; > > unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ > unsigned long r_started; /* start time to measure timeout against */ > @@ -287,6 +298,7 @@ struct ceph_mds_request { > > struct kref r_kref; > struct list_head r_wait; > + struct completion batch_op_completion; > struct completion r_completion; > struct completion r_safe_completion; > ceph_mds_request_callback_t r_callback; > @@ -425,6 +437,7 @@ extern struct ceph_mds_request * > ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode); > extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, > struct ceph_mds_request *req); > +extern int ceph_mdsc_wait_for_request(struct ceph_mds_request* req); > extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, > struct inode *dir, > struct ceph_mds_request *req); > diff --git a/fs/ceph/super.c b/fs/ceph/super.c > index da2cd8e89062..857fbbc0b768 100644 > --- a/fs/ceph/super.c > +++ b/fs/ceph/super.c > @@ -1177,6 +1177,142 @@ static void __exit exit_ceph(void) > destroy_caches(); > } > > +void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* inode, > + struct ceph_mds_request* req, > + bool is_lookup) > +{ > + struct rb_node *node = NULL; > + struct mds_request_tree* tmp = NULL; > + if (!is_lookup) > + node = inode->getattrs_inflight.rb_node; /* top of the tree */ > + else > + node = inode->lookups_inflight.rb_node; > + > + while (node) > + { > + tmp = rb_entry(node, struct mds_request_tree, parent_tree_node); > + > + if (tmp->mask > req->r_args.getattr.mask) > + node = node->rb_left; > + else if (tmp->mask < req->r_args.getattr.mask) > + node = node->rb_right; > + else > + break; > + } > + > + if (!node) > + BUG(); > + > + rb_erase(&req->getattr_lookup_node, &tmp->mds_requests); > + > + if (RB_EMPTY_ROOT(&tmp->mds_requests)) { > + if (!is_lookup) > + rb_erase(node, &inode->getattrs_inflight); > + else > + rb_erase(node, &inode->lookups_inflight); > + kref_put(&tmp->ref, ceph_mds_requests_tree_release); > + } > +} > + > +int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci, > + struct ceph_mds_request* req, > + bool is_lookup) > +{ > + struct rb_node **p = NULL, *parent = NULL; > + struct mds_request_tree *tmp = NULL; > + struct ceph_mds_request* tmp_req = NULL; > + struct mds_request_tree* mrtree = NULL; > + > + if (!is_lookup) > + p = &ci->getattrs_inflight.rb_node; > + else > + p = &ci->lookups_inflight.rb_node; > + > + while (*p) { > + parent = *p; > + tmp = rb_entry(parent, struct mds_request_tree, parent_tree_node); > + if (req->r_args.getattr.mask < tmp->mask) > + p = &(*p)->rb_left; > + else if (req->r_args.getattr.mask > tmp->mask) > + p = &(*p)->rb_right; > + else > + break; > + } > + > + if (!(*p)) { > + mrtree = kzalloc(sizeof(struct mds_request_tree), GFP_NOFS); > + > + if (!mrtree) > + return -ENOMEM; > + > + mrtree->mds_requests = RB_ROOT; > + mrtree->mask = req->r_args.getattr.mask; > + kref_init(&mrtree->ref); > + rb_link_node(&mrtree->parent_tree_node, parent, p); > + if (!is_lookup) { > + rb_insert_color(&mrtree->parent_tree_node, &ci->getattrs_inflight); > + } else { > + rb_insert_color(&mrtree->parent_tree_node, &ci->lookups_inflight); > + } > + } else { > + mrtree = rb_entry((*p), struct mds_request_tree, parent_tree_node); > + } > + > + p = &mrtree->mds_requests.rb_node; > + parent = NULL; > + while (*p) { > + parent = *p; > + tmp_req = rb_entry(parent, struct ceph_mds_request, getattr_lookup_node); > + if (req->last_caps_flush_ack_tid < tmp_req->last_caps_flush_ack_tid) > + p = &(*p)->rb_left; > + else if (req->last_caps_flush_ack_tid > tmp_req->last_caps_flush_ack_tid) > + p = &(*p)->rb_right; > + else > + BUG(); > + } > + > + rb_link_node(&req->getattr_lookup_node, parent, p); > + rb_insert_color(&req->getattr_lookup_node, &mrtree->mds_requests); > + return 0; > +} > + > +struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode, > + int mask, > + bool is_lookup) > +{ > + struct rb_node *node = NULL; > + if (!is_lookup) > + node = inode->getattrs_inflight.rb_node; /* top of the tree */ > + else > + node = inode->lookups_inflight.rb_node; > + > + while (node) > + { > + struct mds_request_tree* tmp = NULL; > + tmp = rb_entry(node, struct mds_request_tree, parent_tree_node); > + > + if (tmp->mask > mask) { > + node = node->rb_left; > + } else if (tmp->mask < mask) { > + node = node->rb_right; > + } else { > + struct rb_node* inner_node = tmp->mds_requests.rb_node; > + while (inner_node) { > + struct ceph_mds_request* req = NULL; > + > + req = rb_entry(inner_node, struct ceph_mds_request, getattr_lookup_node); > + if (req->last_caps_flush_ack_tid > inode->last_flush_ack_tid) { > + inner_node = inner_node->rb_left; > + } else if (req->last_caps_flush_ack_tid < inode->last_flush_ack_tid) { > + inner_node = inner_node->rb_right; > + } else > + return req; /* Found it */ > + } > + return NULL; > + } > + } > + return NULL; > +} > module_init(init_ceph); > module_exit(exit_ceph); > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h > index dfb64a5211b6..abf761f2a122 100644 > --- a/fs/ceph/super.h > +++ b/fs/ceph/super.h > @@ -296,6 +296,8 @@ struct ceph_inode_info { > struct ceph_vino i_vino; /* ceph ino + snap */ > > spinlock_t i_ceph_lock; > + struct mutex getattrs_inflight_lock, lookups_inflight_lock; > + struct rb_root getattrs_inflight, lookups_inflight; > > u64 i_version; > u64 i_inline_version; > @@ -329,6 +331,7 @@ struct ceph_inode_info { > struct rb_root i_caps; /* cap list */ > struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ > unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ > + u64 last_flush_ack_tid; > struct list_head i_dirty_item, i_flushing_item; > /* we need to track cap writeback on a per-cap-bit basis, to allow > * overlapping, pipelined cap flushes to the mds. we can probably > @@ -864,6 +867,17 @@ extern void ceph_fill_file_time(struct inode *inode, int issued, > u64 time_warp_seq, struct timespec64 *ctime, > struct timespec64 *mtime, > struct timespec64 *atime); > +extern int __register_inode_getattr_or_lookup(struct ceph_inode_info* ci, > + struct ceph_mds_request* req, > + bool is_lookup); > + > +extern void __unregister_inode_getattr_or_lookup(struct ceph_inode_info* ci, > + struct ceph_mds_request* req, > + bool is_lookup); > + > +extern struct ceph_mds_request* __search_inode_getattr_or_lookup(struct ceph_inode_info* inode, > + int mask, > + bool is_lookup); > extern int ceph_fill_trace(struct super_block *sb, > struct ceph_mds_request *req); > extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, > -- > 2.19.1 >