Re: [PATCH] ceph: show more detail logs during mount

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 2020/2/24 3:18, Jeff Layton wrote:
On Sun, 2020-02-23 at 07:18 -0500, xiubli@xxxxxxxxxx wrote:
From: Xiubo Li <xiubli@xxxxxxxxxx>

Return -ETIMEDOUT when the requests are timed out instead of -EIO
to make it cleaner for the userland. And just print the logs in
error level to give a helpful hint.

URL: https://tracker.ceph.com/issues/44215
Signed-off-by: Xiubo Li <xiubli@xxxxxxxxxx>
---
  fs/ceph/mds_client.c   |  4 ++--
  fs/ceph/super.c        | 28 ++++++++++++++++++++--------
  net/ceph/ceph_common.c |  7 +++++--
  net/ceph/mon_client.c  |  1 +
  4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 82f63ef2694c..0dfea8cdb50a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -2578,7 +2578,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
  	if (req->r_timeout &&
  	    time_after_eq(jiffies, req->r_started + req->r_timeout)) {
  		dout("do_request timed out\n");
-		err = -EIO;
+		err = -ETIMEDOUT;
  		goto finish;
  	}
  	if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) {
@@ -2752,7 +2752,7 @@ static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
  		if (timeleft > 0)
  			err = 0;
  		else if (!timeleft)
-			err = -EIO;  /* timed out */
+			err = -ETIMEDOUT;  /* timed out */
  		else
  			err = timeleft;  /* killed */
  	}
I'm fine with the two hunks above.

Note that AFAICT, r_timeout is only set at mount time, so we should
never see this in other codepaths. This probably ought to be done in a
separate patch from the rest.
Sure, will split it.
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 31acb4fe1f2c..6778f2a7d6d4 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -849,11 +849,13 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
  {
  	struct ceph_mds_client *mdsc = fsc->mdsc;
  	struct ceph_mds_request *req = NULL;
+	struct ceph_mds_session *session;
  	int err;
  	struct dentry *root;
+	char buf[32] = {0};
/* open dir */
-	dout("open_root_inode opening '%s'\n", path);
+	dout("mount open_root_inode opening '%s'\n", path);
  	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
  	if (IS_ERR(req))
  		return ERR_CAST(req);
@@ -873,18 +875,26 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
  	if (err == 0) {
  		struct inode *inode = req->r_target_inode;
  		req->r_target_inode = NULL;
-		dout("open_root_inode success\n");
  		root = d_make_root(inode);
  		if (!root) {
  			root = ERR_PTR(-ENOMEM);
  			goto out;
  		}
-		dout("open_root_inode success, root dentry is %p\n", root);
+		dout(" root dentry is %p\n", root);
  	} else {
  		root = ERR_PTR(err);
  	}
  out:
+	session = ceph_get_mds_session(req->r_session);
+	if (session)
+		snprintf(buf, 32, " on mds%d", session->s_mds);
+
Where is this new session reference put? I'm not sure I understand this
change.

Ah okay, forgot to add the put.

Usually the mount request will do in one specified MDS, here will show in which MDS it was trying to. Will it make sense ?

  	ceph_mdsc_put_request(req);
+	if (!IS_ERR(root))
+		dout("mount open_root_inode success%s\n", buf[0] ? buf : "");
+	else
+		pr_err("mount open_root_inode fail %ld%s\n", PTR_ERR(root),
+		       buf[0] ? buf : "");
  	return root;
  }
This goes to the generic kernel log. This may warrant an error message
there, but it needs to be something a typical user would understand.

Will fix it.

@@ -937,6 +947,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
out:
  	mutex_unlock(&fsc->client->mount_mutex);
+	pr_err("mount fail\n");
This is not something we'd want to add here.
Okay, will delete it.
  	return ERR_PTR(err);
  }
@@ -1028,7 +1039,7 @@ static int ceph_get_tree(struct fs_context *fc)
  		ceph_compare_super;
  	int err;
- dout("ceph_get_tree\n");
+	dout("ceph_get_tree start\n");
if (!fc->source)
  		return invalfc(fc, "No source");
@@ -1073,14 +1084,15 @@ static int ceph_get_tree(struct fs_context *fc)
  		err = PTR_ERR(res);
  		goto out_splat;
  	}
-	dout("root %p inode %p ino %llx.%llx\n", res,
-	     d_inode(res), ceph_vinop(d_inode(res)));
+        dout(" root %p inode %p ino %llx.%llx\n",
+	     res, d_inode(res), ceph_vinop(d_inode(res)));
  	fc->root = fsc->sb->s_root;
+	dout("ceph_get_tree success\n");
  	return 0;
out_splat:
  	if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
-		pr_info("No mds server is up or the cluster is laggy\n");
+		pr_err("No mds server is up or the cluster is laggy\n");
		err = -EHOSTUNREACH;
Is pr_err the right infolevel here? It may be, I'm just curious why the
change after making this pr_info before.

This should be as an error IMO, before we may just want an hint but ignore the level stuff.


  	}
@@ -1091,7 +1103,7 @@ static int ceph_get_tree(struct fs_context *fc)
  out:
  	destroy_fs_client(fsc);
  out_final:
-	dout("ceph_get_tree fail %d\n", err);
+	pr_err("ceph_get_tree fail %d\n", err);
This doesn't seem to be something we want to pr_err.

  	return err;
  }
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index a0e97f6c1072..5971a815fb8e 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -700,11 +700,14 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
  		return err;
while (!have_mon_and_osd_map(client)) {
-		if (timeout && time_after_eq(jiffies, started + timeout))
+		if (timeout && time_after_eq(jiffies, started + timeout)) {
+			pr_err("mount wating for mon/osd maps timed out on mon%d\n",
+			       client->monc.cur_mon);
  			return -ETIMEDOUT;
+		}

This code is also called from RBD codepaths and those don't necessarily
involve a "mount". You should consider how to make the new message more
generic, or maybe handle it at a higher level, so that rbd isn't
affected.

Okay, actually the RBD is also in the 'mount' codepath here.


  		/* wait */
-		dout("mount waiting for mon_map\n");
+		dout("mount waiting for mon/osd maps\n");
  		err = wait_event_interruptible_timeout(client->auth_wq,
  			have_mon_and_osd_map(client) || (client->auth_err < 0),
  			ceph_timeout_jiffies(timeout));

diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 9d9e4e4ea600..8f09df9c3aee 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -1179,6 +1179,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
if (ret < 0) {
  		monc->client->auth_err = ret;
+		pr_err("mon%d session auth failed %d\n", monc->cur_mon, ret);
  	} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
  		dout("authenticated, starting session\n");
Again, these need to be human-readable, and not just debugging messages
that are hard to disable.

Okay, will fix it.

Thanks.

BRs






[Index of Archives]     [CEPH Users]     [Ceph Large]     [Ceph Dev]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]

  Powered by Linux