On Mon 2022-12-05 14:15 +0800, xiubli@xxxxxxxxxx wrote: > From: Xiubo Li <xiubli@xxxxxxxxxx> > > When we receive a corrupted snap trace we don't know what exactly has > happened on the MDS side. And we shouldn't continue writing to OSD, > which may corrupt the snapshot contents. > > Just try to blocklist this client and if it fails we need to crash the > client instead of leaving it writeable to OSDs. > > URL: https://tracker.ceph.com/issues/57686 > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/mds_client.c | 3 ++- > fs/ceph/mds_client.h | 1 + > fs/ceph/snap.c | 25 +++++++++++++++++++++++++ > 3 files changed, 28 insertions(+), 1 deletion(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index cbbaf334b6b8..59094944af28 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con) > struct ceph_mds_client *mdsc = s->s_mdsc; > > pr_warn("mds%d closed our session\n", s->s_mds); > - send_mds_reconnect(mdsc, s); > + if (!mdsc->no_reconnect) > + send_mds_reconnect(mdsc, s); > } > > static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index 728b7d72bf76..8e8f0447c0ad 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -413,6 +413,7 @@ struct ceph_mds_client { > atomic_t num_sessions; > int max_sessions; /* len of sessions array */ > int stopping; /* true if shutting down */ > + int no_reconnect; /* true if snap trace is corrupted */ > > atomic64_t quotarealms_count; /* # realms with quota */ > /* > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > index c1c452afa84d..5b211ec7d7f7 100644 > --- a/fs/ceph/snap.c > +++ b/fs/ceph/snap.c > @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > struct ceph_snap_realm *realm; > struct ceph_snap_realm *first_realm = NULL; > struct ceph_snap_realm *realm_to_rebuild = NULL; > + struct ceph_client *client = 
mdsc->fsc->client; > int rebuild_snapcs; > int err = -ENOMEM; > int ret; > LIST_HEAD(dirty_realms); > > lockdep_assert_held_write(&mdsc->snap_rwsem); > @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > if (first_realm) > ceph_put_snap_realm(mdsc, first_realm); > pr_err("%s error %d\n", __func__, err); > + > + /* > + * When we receive a corrupted snap trace we don't know what > + * exactly has happened on the MDS side. And we shouldn't continue > + * writing to OSD, which may corrupt the snapshot contents. > + * > + * Just try to blocklist this client and if it fails we need to > + * crash the client instead of leaving it writeable to OSDs. > + * > + * Then this kclient must be remounted to continue after the > + * corrupted metadata is fixed on the MDS side. > + */ > + mdsc->no_reconnect = 1; > + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); > + if (ret) { > + pr_err("%s blocklist of %s failed: %d", __func__, > + ceph_pr_addr(&client->msgr.inst.addr), ret); > + BUG(); > + } > + pr_err("%s %s was blocklisted, do remount to continue%s", > + __func__, ceph_pr_addr(&client->msgr.inst.addr), > + err == -EIO ? 
" after corrupted snaptrace fixed": ""); > + > return err; > } Hi Xiubo, How about using a WARN() so that we taint the Linux kernel too: diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 864cdaa0d2bd..1941584b8fdb 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -766,8 +766,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -883,6 +885,31 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + + /* + * When received corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this client and if fails we need to + * crash the client instead of leaving it writeable to OSDs. + * + * Then this kclient must be remounted to continue after the + * corrupted metadata fixed in MDS side. + */ + mdsc->no_reconnect = 1; + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) { + pr_err("%s blocklist of %s failed: %d", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + BUG(); + } + + WARN(1, "%s %s was blocklisted, do remount to continue%s", + __func__, ceph_pr_addr(&client->msgr.inst.addr), + err == -EIO ? " after corrupted snaptrace fixed": ""); + return err; } -- Aaron Tomlin