Patch "ceph: flush the mdlog before waiting on unsafe reqs" has been added to the 5.10-stable tree

Sasha Levin <sashal@xxxxxxxxxx> · Sun, 27 Nov 2022 20:23:33 -0500

This is a note to let you know that I've just added the patch titled

    ceph: flush the mdlog before waiting on unsafe reqs

to the 5.10-stable tree which can be found at:
    http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary

The filename of the patch is:
     ceph-flush-the-mdlog-before-waiting-on-unsafe-reqs.patch
and it can be found in the queue-5.10 subdirectory.

If you, or anyone else, feels it should not be added to the stable tree,
please let <stable@xxxxxxxxxxxxxxx> know about it.



commit f8cdd34ab2e440220cabbe9bed7a4cad936379b7
Author: Xiubo Li <xiubli@xxxxxxxxxx>
Date:   Mon Jul 5 09:22:57 2021 +0800

    ceph: flush the mdlog before waiting on unsafe reqs
    
    [ Upstream commit e1a4541ec0b951685a49d1f72d183681e6433a45 ]
    
    For the client requests who will have unsafe and safe replies from
    MDS daemons, in the MDS side the MDS daemons won't flush the mdlog
    (journal log) immediatelly, because they think it's unnecessary.
    That's true for most cases but not all, likes the fsync request.
    The fsync will wait until all the unsafe replied requests to be
    safely replied.
    
    Normally if there have multiple threads or clients are running, the
    whole mdlog in MDS daemons could be flushed in time if any request
    will trigger the mdlog submit thread. So usually we won't experience
    the normal operations will stuck for a long time. But in case there
    has only one client with only thread is running, the stuck phenomenon
    maybe obvious and the worst case it must wait at most 5 seconds to
    wait the mdlog to be flushed by the MDS's tick thread periodically.
    
    This patch will trigger to flush the mdlog in the relevant and auth
    MDSes to which the in-flight requests are sent just before waiting
    the unsafe requests to finish.
    
    Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
    Reviewed-by: Jeff Layton <jlayton@xxxxxxxxxx>
    Signed-off-by: Ilya Dryomov <idryomov@xxxxxxxxx>
    Stable-dep-of: 5bd76b8de5b7 ("ceph: fix NULL pointer dereference for req->r_session")
    Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 7ae27a18cf18..2fa6b7cc0cc4 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2294,6 +2294,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
  */
 static int unsafe_request_wait(struct inode *inode)
 {
+	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
 	int ret, err = 0;
@@ -2313,6 +2314,81 @@ static int unsafe_request_wait(struct inode *inode)
 	}
 	spin_unlock(&ci->i_unsafe_lock);
 
+	/*
+	 * Trigger to flush the journal logs in all the relevant MDSes
+	 * manually, or in the worst case we must wait at most 5 seconds
+	 * to wait the journal logs to be flushed by the MDSes periodically.
+	 */
+	if (req1 || req2) {
+		struct ceph_mds_session **sessions = NULL;
+		struct ceph_mds_session *s;
+		struct ceph_mds_request *req;
+		unsigned int max;
+		int i;
+
+		/*
+		 * The mdsc->max_sessions is unlikely to be changed
+		 * mostly, here we will retry it by reallocating the
+		 * sessions arrary memory to get rid of the mdsc->mutex
+		 * lock.
+		 */
+retry:
+		max = mdsc->max_sessions;
+		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
+		if (!sessions)
+			return -ENOMEM;
+
+		spin_lock(&ci->i_unsafe_lock);
+		if (req1) {
+			list_for_each_entry(req, &ci->i_unsafe_dirops,
+					    r_unsafe_dir_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds > max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		if (req2) {
+			list_for_each_entry(req, &ci->i_unsafe_iops,
+					    r_unsafe_target_item) {
+				s = req->r_session;
+				if (unlikely(s->s_mds > max)) {
+					spin_unlock(&ci->i_unsafe_lock);
+					goto retry;
+				}
+				if (!sessions[s->s_mds]) {
+					s = ceph_get_mds_session(s);
+					sessions[s->s_mds] = s;
+				}
+			}
+		}
+		spin_unlock(&ci->i_unsafe_lock);
+
+		/* the auth MDS */
+		spin_lock(&ci->i_ceph_lock);
+		if (ci->i_auth_cap) {
+		      s = ci->i_auth_cap->session;
+		      if (!sessions[s->s_mds])
+			      sessions[s->s_mds] = ceph_get_mds_session(s);
+		}
+		spin_unlock(&ci->i_ceph_lock);
+
+		/* send flush mdlog request to MDSes */
+		for (i = 0; i < max; i++) {
+			s = sessions[i];
+			if (s) {
+				send_flush_mdlog(s);
+				ceph_put_mds_session(s);
+			}
+		}
+		kfree(sessions);
+	}
+
 	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
 	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
 	if (req1) {