On Mon 2022-12-05 14:15 +0800, xiubli@xxxxxxxxxx wrote: > From: Xiubo Li <xiubli@xxxxxxxxxx> > > When we receive a corrupted snap trace we don't know what exactly has > happened on the MDS side. And we shouldn't continue writing to OSD, > which may corrupt the snapshot contents. > > Just try to blocklist this client and if it fails we need to crash the > client instead of leaving it writeable to OSDs. > > URL: https://tracker.ceph.com/issues/57686 > Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx> > --- > fs/ceph/mds_client.c | 3 ++- > fs/ceph/mds_client.h | 1 + > fs/ceph/snap.c | 25 +++++++++++++++++++++++++ > 3 files changed, 28 insertions(+), 1 deletion(-) > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c > index cbbaf334b6b8..59094944af28 100644 > --- a/fs/ceph/mds_client.c > +++ b/fs/ceph/mds_client.c > @@ -5648,7 +5648,8 @@ static void mds_peer_reset(struct ceph_connection *con) > struct ceph_mds_client *mdsc = s->s_mdsc; > > pr_warn("mds%d closed our session\n", s->s_mds); > - send_mds_reconnect(mdsc, s); > + if (!mdsc->no_reconnect) > + send_mds_reconnect(mdsc, s); > } > > static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg) > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h > index 728b7d72bf76..8e8f0447c0ad 100644 > --- a/fs/ceph/mds_client.h > +++ b/fs/ceph/mds_client.h > @@ -413,6 +413,7 @@ struct ceph_mds_client { > atomic_t num_sessions; > int max_sessions; /* len of sessions array */ > int stopping; /* true if shutting down */ > + int no_reconnect; /* true if snap trace is corrupted */ > > atomic64_t quotarealms_count; /* # realms with quota */ > /* > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c > index c1c452afa84d..5b211ec7d7f7 100644 > --- a/fs/ceph/snap.c > +++ b/fs/ceph/snap.c > @@ -767,8 +767,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > struct ceph_snap_realm *realm; > struct ceph_snap_realm *first_realm = NULL; > struct ceph_snap_realm *realm_to_rebuild = NULL; > + struct ceph_client *client = 
mdsc->fsc->client; > int rebuild_snapcs; > int err = -ENOMEM; > int ret; > LIST_HEAD(dirty_realms); > > lockdep_assert_held_write(&mdsc->snap_rwsem); > @@ -885,6 +887,29 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, > if (first_realm) > ceph_put_snap_realm(mdsc, first_realm); > pr_err("%s error %d\n", __func__, err); > + > + /* > + * When we receive a corrupted snap trace we don't know what > + * exactly has happened on the MDS side. And we shouldn't continue > + * writing to OSD, which may corrupt the snapshot contents. > + * > + * Just try to blocklist this client and if it fails we need to > + * crash the client instead of leaving it writeable to OSDs. > + * > + * Then this kclient must be remounted to continue after the > + * corrupted metadata is fixed on the MDS side. > + */ > + mdsc->no_reconnect = 1; > + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); > + if (ret) { > + pr_err("%s blocklist of %s failed: %d", __func__, > + ceph_pr_addr(&client->msgr.inst.addr), ret); > + BUG(); > + } > + pr_err("%s %s was blocklisted, do remount to continue%s", > + __func__, ceph_pr_addr(&client->msgr.inst.addr), > + err == -EIO ? 
" after corrupted snaptrace fixed": ""); > + > return err; > } Hi Xiubo, How about using a WARN() so that we taint the Linux kernel too: diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 864cdaa0d2bd..1941584b8fdb 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -766,8 +766,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, struct ceph_snap_realm *realm = NULL; struct ceph_snap_realm *first_realm = NULL; struct ceph_snap_realm *realm_to_rebuild = NULL; + struct ceph_client *client = mdsc->fsc->client; int rebuild_snapcs; int err = -ENOMEM; + int ret; LIST_HEAD(dirty_realms); lockdep_assert_held_write(&mdsc->snap_rwsem); @@ -883,6 +885,31 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc, if (first_realm) ceph_put_snap_realm(mdsc, first_realm); pr_err("%s error %d\n", __func__, err); + + + /* + * When received corrupted snap trace we don't know what + * exactly has happened in MDS side. And we shouldn't continue + * writing to OSD, which may corrupt the snapshot contents. + * + * Just try to blocklist this client and if fails we need to + * crash the client instead of leaving it writeable to OSDs. + * + * Then this kclient must be remounted to continue after the + * corrupted metadata fixed in MDS side. + */ + mdsc->no_reconnect = 1; + ret = ceph_monc_blocklist_add(&client->monc, &client->msgr.inst.addr); + if (ret) { + pr_err("%s blocklist of %s failed: %d", __func__, + ceph_pr_addr(&client->msgr.inst.addr), ret); + BUG(); + } + + WARN(1, "%s %s was blocklisted, do remount to continue%s", + __func__, ceph_pr_addr(&client->msgr.inst.addr), + err == -EIO ? " after corrupted snaptrace fixed": ""); + return err; } -- Aaron Tomlin