Re: [PATCH] libceph: Complete stuck requests to OSD with EIO

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Feb 10, 2017 at 1:25 PM, Ilya Dryomov <idryomov@xxxxxxxxx> wrote:
> On Fri, Feb 10, 2017 at 12:31 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
>> On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote:
>>> From: Artur Molchanov <artur.molchanov@xxxxxxxxxx>
>>>
>>> Complete stuck requests to OSD with error EIO after osd_request_timeout expired.
>>> If osd_request_timeout equals to 0 (default value) then do nothing with
>>> hung requests (keep default behavior).
>>>
>>> Create RBD map option osd_request_timeout to set timeout in seconds. Set
>>> osd_request_timeout to 0 by default.
>>>
>> Also, what exactly are the requests blocked on when this occurs? Is the
>> ceph_osd_request_target ending up paused? I wonder if we might be better
>> off with something that returns a hard error under the circumstances
>> where you're hanging, rather than depending on timeouts.
>>
>>
>>> Signed-off-by: Artur Molchanov <artur.molchanov@xxxxxxxxxx>
>>> ---
>>>   include/linux/ceph/libceph.h    |  8 +++--
>>>   include/linux/ceph/osd_client.h |  1 +
>>>   net/ceph/ceph_common.c          | 12 +++++++
>>>   net/ceph/osd_client.c           | 72 +++++++++++++++++++++++++++++++++++++++++
>>>   4 files changed, 90 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
>>> index 1816c5e..10d8acb 100644
>>> --- a/include/linux/ceph/libceph.h
>>> +++ b/include/linux/ceph/libceph.h
>>> @@ -48,6 +48,7 @@ struct ceph_options {
>>>       unsigned long mount_timeout;            /* jiffies */
>>>       unsigned long osd_idle_ttl;             /* jiffies */
>>>       unsigned long osd_keepalive_timeout;    /* jiffies */
>>> +     unsigned long osd_request_timeout;      /* jiffies */
>>>
>>>       /*
>>>        * any type that can't be simply compared or doesn't need need
>>> @@ -65,9 +66,10 @@ struct ceph_options {
>>>   /*
>>>    * defaults
>>>    */
>>> -#define CEPH_MOUNT_TIMEOUT_DEFAULT   msecs_to_jiffies(60 * 1000)
>>> -#define CEPH_OSD_KEEPALIVE_DEFAULT   msecs_to_jiffies(5 * 1000)
>>> -#define CEPH_OSD_IDLE_TTL_DEFAULT    msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_MOUNT_TIMEOUT_DEFAULT           msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_OSD_KEEPALIVE_DEFAULT           msecs_to_jiffies(5 * 1000)
>>> +#define CEPH_OSD_IDLE_TTL_DEFAULT            msecs_to_jiffies(60 * 1000)
>>> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT     msecs_to_jiffies(0 * 1000)
>>>
>>>   #define CEPH_MONC_HUNT_INTERVAL             msecs_to_jiffies(3 * 1000)
>>>   #define CEPH_MONC_PING_INTERVAL             msecs_to_jiffies(10 * 1000)
>>> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
>>> index 03a6653..e45cba0 100644
>>> --- a/include/linux/ceph/osd_client.h
>>> +++ b/include/linux/ceph/osd_client.h
>>> @@ -279,6 +279,7 @@ struct ceph_osd_client {
>>>       atomic_t               num_homeless;
>>>       struct delayed_work    timeout_work;
>>>       struct delayed_work    osds_timeout_work;
>>> +     struct delayed_work    osd_request_timeout_work;
>>>   #ifdef CONFIG_DEBUG_FS
>>>       struct dentry          *debugfs_file;
>>>   #endif
>>> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
>>> index 464e885..81876c8 100644
>>> --- a/net/ceph/ceph_common.c
>>> +++ b/net/ceph/ceph_common.c
>>> @@ -230,6 +230,7 @@ enum {
>>>       Opt_osdkeepalivetimeout,
>>>       Opt_mount_timeout,
>>>       Opt_osd_idle_ttl,
>>> +     Opt_osd_request_timeout,
>>>       Opt_last_int,
>>>       /* int args above */
>>>       Opt_fsid,
>>> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
>>>       {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
>>>       {Opt_mount_timeout, "mount_timeout=%d"},
>>>       {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
>>> +     {Opt_osd_request_timeout, "osd_request_timeout=%d"},
>>>       /* int args above */
>>>       {Opt_fsid, "fsid=%s"},
>>>       {Opt_name, "name=%s"},
>>> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
>>>       opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
>>>       opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
>>>       opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
>>> +     opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
>>>
>>>       /* get mon ip(s) */
>>>       /* ip1[:port1][,ip2[:port2]...] */
>>> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
>>>                       }
>>>                       opt->mount_timeout = msecs_to_jiffies(intval * 1000);
>>>                       break;
>>> +             case Opt_osd_request_timeout:
>>> +                     /* 0 is "wait forever" (i.e. infinite timeout) */
>>> +                     if (intval < 0 || intval > INT_MAX / 1000) {
>>> +                             pr_err("osd_request_timeout out of range\n");
>>> +                             err = -EINVAL;
>>> +                             goto out;
>>> +                     }
>>> +                     opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
>>> +                     break;
>>>
>>>               case Opt_share:
>>>                       opt->flags &= ~CEPH_OPT_NOSHARE;
>>> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
>>> index 842f049..2f5a024 100644
>>> --- a/net/ceph/osd_client.c
>>> +++ b/net/ceph/osd_client.c
>>> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work)
>>>                             osdc->client->options->osd_keepalive_timeout);
>>>   }
>>>
>>> +/*
>>> + * Complete request to OSD with error err if it started before cutoff.
>>> + */
>>> +static void complete_osd_stuck_request(struct ceph_osd_request *req,
>>> +                                    unsigned long cutoff,
>>> +                                    int err)
>>> +{
>>> +     if (!time_before(req->r_stamp, cutoff))
>>> +             return;
>>> +
>>> +     pr_warn_ratelimited("osd req on osd%d timeout expired\n",
>>> +                         req->r_osd->o_osd);
>>> +
>>> +     complete_request(req, err);
>>> +}
>>> +
>>> +/*
>>> + * Complete all requests to OSD that has been active for more than timeout.
>>> + */
>>> +static void complete_osd_stuck_requests(struct ceph_osd *osd,
>>> +                                     unsigned long timeout,
>>> +                                     int err)
>>> +{
>>> +     struct ceph_osd_request *req, *n;
>>> +     const unsigned long cutoff = jiffies - timeout;
>>> +
>>> +     rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) {
>>> +             complete_osd_stuck_request(req, cutoff, -EIO);
>>> +     }
>>> +}
>>> +
>>> +/*
>>> + * Timeout callback, called every N seconds. When 1 or more OSD
>>> + * requests that has been active for more than N seconds,
>>> + * we complete it with error EIO.
>>> + */
>>> +static void handle_osd_request_timeout(struct work_struct *work)
>>> +{
>>> +     struct ceph_osd_client *osdc =
>>> +             container_of(work, struct ceph_osd_client,
>>> +                          osd_request_timeout_work.work);
>>> +     struct ceph_osd *osd, *n;
>>> +     struct ceph_options *opts = osdc->client->options;
>>> +     const unsigned long timeout = opts->osd_request_timeout;
>>> +
>>> +     dout("%s osdc %p\n", __func__, osdc);
>>> +
>>> +     if (!timeout)
>>> +             return;
>>> +
>>> +     down_write(&osdc->lock);
>>> +
>>> +     rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) {
>>> +             complete_osd_stuck_requests(osd, timeout, -EIO);
>>> +     }
>>> +
>>> +     up_write(&osdc->lock);
>>> +
>>> +     if (!atomic_read(&osdc->num_homeless))
>>> +             goto out;
>>> +
>>> +     down_write(&osdc->lock);
>>> +     complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO);
>>> +     up_write(&osdc->lock);
>>> +
>>> +out:
>>> +     schedule_delayed_work(&osdc->osd_request_timeout_work, timeout);
>>> +}
>>> +
>>>   static void handle_osds_timeout(struct work_struct *work)
>>>   {
>>>       struct ceph_osd_client *osdc =
>>> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>>> ceph_client *client)
>>>       osdc->linger_map_checks = RB_ROOT;
>>>       INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
>>>       INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
>>> +     INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout);
>>>
>>>       err = -ENOMEM;
>>>       osdc->osdmap = ceph_osdmap_alloc();
>>> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct
>>> ceph_client *client)
>>>                             osdc->client->options->osd_keepalive_timeout);
>>>       schedule_delayed_work(&osdc->osds_timeout_work,
>>>           round_jiffies_relative(osdc->client->options->osd_idle_ttl));
>>> +     schedule_delayed_work(&osdc->osd_request_timeout_work,
>>> +         round_jiffies_relative(osdc->client->options->osd_request_timeout));
>>>
>>>       return 0;
>>>
>>
>> Having a job that has to wake up every second or so isn't ideal. Perhaps
>> you would be better off scheduling the delayed work in the request
>> submission codepath, and only rearm it when the tree isn't empty after
>> calling complete_osd_stuck_requests?
>>
>> Also, I don't see where this job is ever cancelled when the osdc is torn
>> down. That needs to occur or you'll cause a use-after-free oops...
>
> Yeah, there are other bugs here: rbtree_postorder_for_each_entry_safe()
> can't be used because complete_request() calls erase(), etc.  The patch
> I had just piggy-backed on handle_timeout(), which is called every five
> seconds anyway.
>
> Given that there is interest, I think it's worth adding.

Hi Artur,

How about the attached patch?  handle_timeout() is going to iterate
over all but the homeless OSD anyway; all it costs us is a couple of
tests, so I don't think a separate work is needed.

abort_request() is a simple wrapper around complete_request(), making
it safe to call at any time -- replace it with complete_request() for
now if you want to try this out.

Thanks,

                Ilya
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 1816c5e26581..88cd5dc8e238 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -48,6 +48,7 @@ struct ceph_options {
 	unsigned long mount_timeout;		/* jiffies */
 	unsigned long osd_idle_ttl;		/* jiffies */
 	unsigned long osd_keepalive_timeout;	/* jiffies */
+	unsigned long osd_request_timeout;	/* jiffies */
 
 	/*
 	 * any type that can't be simply compared or doesn't need need
@@ -68,6 +69,7 @@ struct ceph_options {
 #define CEPH_MOUNT_TIMEOUT_DEFAULT	msecs_to_jiffies(60 * 1000)
 #define CEPH_OSD_KEEPALIVE_DEFAULT	msecs_to_jiffies(5 * 1000)
 #define CEPH_OSD_IDLE_TTL_DEFAULT	msecs_to_jiffies(60 * 1000)
+#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT 0  /* no timeout */
 
 #define CEPH_MONC_HUNT_INTERVAL		msecs_to_jiffies(3 * 1000)
 #define CEPH_MONC_PING_INTERVAL		msecs_to_jiffies(10 * 1000)
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 03a6653d329a..4db031dd748d 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -193,6 +193,7 @@ struct ceph_osd_request {
 
 	/* internal */
 	unsigned long r_stamp;                /* jiffies, send or check time */
+	unsigned long r_start_stamp;          /* jiffies */
 	int r_attempts;
 	struct ceph_eversion r_replay_version; /* aka reassert_version */
 	u32 r_last_force_resend;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 464e88599b9d..108533859a53 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -230,6 +230,7 @@ enum {
 	Opt_osdkeepalivetimeout,
 	Opt_mount_timeout,
 	Opt_osd_idle_ttl,
+	Opt_osd_request_timeout,
 	Opt_last_int,
 	/* int args above */
 	Opt_fsid,
@@ -256,6 +257,7 @@ static match_table_t opt_tokens = {
 	{Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
 	{Opt_mount_timeout, "mount_timeout=%d"},
 	{Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+	{Opt_osd_request_timeout, "osd_request_timeout=%d"},
 	/* int args above */
 	{Opt_fsid, "fsid=%s"},
 	{Opt_name, "name=%s"},
@@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name,
 	opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
 	opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
 	opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
+	opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT;
 
 	/* get mon ip(s) */
 	/* ip1[:port1][,ip2[:port2]...] */
@@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name,
 			}
 			opt->mount_timeout = msecs_to_jiffies(intval * 1000);
 			break;
+		case Opt_osd_request_timeout:
+			/* 0 is "wait forever" (i.e. infinite timeout) */
+			if (intval < 0 || intval > INT_MAX / 1000) {
+				pr_err("osd_request_timeout out of range\n");
+				err = -EINVAL;
+				goto out;
+			}
+			opt->osd_request_timeout = msecs_to_jiffies(intval * 1000);
+			break;
 
 		case Opt_share:
 			opt->flags &= ~CEPH_OPT_NOSHARE;
@@ -557,6 +569,9 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
 	if (opt->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
 		seq_printf(m, "osdkeepalivetimeout=%d,",
 		    jiffies_to_msecs(opt->osd_keepalive_timeout) / 1000);
+	if (opt->osd_request_timeout != CEPH_OSD_REQUEST_TIMEOUT_DEFAULT)
+		seq_printf(m, "osd_request_timeout=%d,",
+			   jiffies_to_msecs(opt->osd_request_timeout) / 1000);
 
 	/* drop redundant comma */
 	if (m->count != pos)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index b8da4279e745..1d580530c700 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -1717,6 +1717,8 @@ static void account_request(struct ceph_osd_request *req)
 
 	WARN_ON(req->r_unsafe_callback && (req->r_flags & mask) != mask);
 	atomic_inc(&req->r_osdc->num_requests);
+
+	req->r_start_stamp = jiffies;
 }
 
 static void submit_request(struct ceph_osd_request *req, bool wrlocked)
@@ -2504,6 +2506,7 @@ static void handle_timeout(struct work_struct *work)
 		container_of(work, struct ceph_osd_client, timeout_work.work);
 	struct ceph_options *opts = osdc->client->options;
 	unsigned long cutoff = jiffies - opts->osd_keepalive_timeout;
+	unsigned long expiry_cutoff = jiffies - opts->osd_request_timeout;
 	LIST_HEAD(slow_osds);
 	struct rb_node *n, *p;
 
@@ -2519,15 +2522,23 @@ static void handle_timeout(struct work_struct *work)
 		struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
 		bool found = false;
 
-		for (p = rb_first(&osd->o_requests); p; p = rb_next(p)) {
+		for (p = rb_first(&osd->o_requests); p; ) {
 			struct ceph_osd_request *req =
 			    rb_entry(p, struct ceph_osd_request, r_node);
 
+			p = rb_next(p); /* abort_request() */
+
 			if (time_before(req->r_stamp, cutoff)) {
 				dout(" req %p tid %llu on osd%d is laggy\n",
 				     req, req->r_tid, osd->o_osd);
 				found = true;
 			}
+			if (opts->osd_request_timeout &&
+			    time_before(req->r_start_stamp, expiry_cutoff)) {
+				pr_err_ratelimited("tid %llu on osd%d timeout\n",
+				       req->r_tid, osd->o_osd);
+				abort_request(req, -ETIMEDOUT);
+			}
 		}
 		for (p = rb_first(&osd->o_linger_requests); p; p = rb_next(p)) {
 			struct ceph_osd_linger_request *lreq =
@@ -2547,6 +2558,21 @@ static void handle_timeout(struct work_struct *work)
 			list_move_tail(&osd->o_keepalive_item, &slow_osds);
 	}
 
+	if (opts->osd_request_timeout) {
+		for (p = rb_first(&osdc->homeless_osd.o_requests); p; ) {
+			struct ceph_osd_request *req =
+			    rb_entry(p, struct ceph_osd_request, r_node);
+
+			p = rb_next(p); /* abort_request() */
+
+			if (time_before(req->r_start_stamp, expiry_cutoff)) {
+				pr_err_ratelimited("tid %llu on osd%d timeout\n",
+				       req->r_tid, osdc->homeless_osd.o_osd);
+				abort_request(req, -ETIMEDOUT);
+			}
+		}
+	}
+
 	if (atomic_read(&osdc->num_homeless) || !list_empty(&slow_osds))
 		maybe_request_map(osdc);
 

[Index of Archives]     [CEPH Users]     [Ceph Large]     [Information on CEPH]     [Linux BTRFS]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]
  Powered by Linux