On Fri, Feb 10, 2017 at 12:31 PM, Jeff Layton <jlayton@xxxxxxxxxx> wrote: > On Thu, 2017-02-09 at 16:04 +0300, Artur Molchanov wrote: >> From: Artur Molchanov <artur.molchanov@xxxxxxxxxx> >> >> Complete stuck requests to OSD with error EIO after osd_request_timeout expired. >> If osd_request_timeout equals to 0 (default value) then do nothing with >> hung requests (keep default behavior). >> >> Create RBD map option osd_request_timeout to set timeout in seconds. Set >> osd_request_timeout to 0 by default. >> > Also, what exactly are the requests blocked on when this occurs? Is the > ceph_osd_request_target ending up paused? I wonder if we might be better > off with something that returns a hard error under the circumstances > where you're hanging, rather than depending on timeouts. > > >> Signed-off-by: Artur Molchanov <artur.molchanov@xxxxxxxxxx> >> --- >> include/linux/ceph/libceph.h | 8 +++-- >> include/linux/ceph/osd_client.h | 1 + >> net/ceph/ceph_common.c | 12 +++++++ >> net/ceph/osd_client.c | 72 +++++++++++++++++++++++++++++++++++++++++ >> 4 files changed, 90 insertions(+), 3 deletions(-) >> >> diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h >> index 1816c5e..10d8acb 100644 >> --- a/include/linux/ceph/libceph.h >> +++ b/include/linux/ceph/libceph.h >> @@ -48,6 +48,7 @@ struct ceph_options { >> unsigned long mount_timeout; /* jiffies */ >> unsigned long osd_idle_ttl; /* jiffies */ >> unsigned long osd_keepalive_timeout; /* jiffies */ >> + unsigned long osd_request_timeout; /* jiffies */ >> >> /* >> * any type that can't be simply compared or doesn't need need >> @@ -65,9 +66,10 @@ struct ceph_options { >> /* >> * defaults >> */ >> -#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) >> -#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) >> -#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) >> +#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) >> +#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) >> +#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) >> +#define CEPH_OSD_REQUEST_TIMEOUT_DEFAULT msecs_to_jiffies(0 * 1000) >> >> #define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000) >> #define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000) >> diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h >> index 03a6653..e45cba0 100644 >> --- a/include/linux/ceph/osd_client.h >> +++ b/include/linux/ceph/osd_client.h >> @@ -279,6 +279,7 @@ struct ceph_osd_client { >> atomic_t num_homeless; >> struct delayed_work timeout_work; >> struct delayed_work osds_timeout_work; >> + struct delayed_work osd_request_timeout_work; >> #ifdef CONFIG_DEBUG_FS >> struct dentry *debugfs_file; >> #endif >> diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c >> index 464e885..81876c8 100644 >> --- a/net/ceph/ceph_common.c >> +++ b/net/ceph/ceph_common.c >> @@ -230,6 +230,7 @@ enum { >> Opt_osdkeepalivetimeout, >> Opt_mount_timeout, >> Opt_osd_idle_ttl, >> + Opt_osd_request_timeout, >> Opt_last_int, >> /* int args above */ >> Opt_fsid, >> @@ -256,6 +257,7 @@ static match_table_t opt_tokens = { >> {Opt_osdkeepalivetimeout, "osdkeepalive=%d"}, >> {Opt_mount_timeout, "mount_timeout=%d"}, >> {Opt_osd_idle_ttl, "osd_idle_ttl=%d"}, >> + {Opt_osd_request_timeout, "osd_request_timeout=%d"}, >> /* int args above */ >> {Opt_fsid, "fsid=%s"}, >> {Opt_name, "name=%s"}, >> @@ -361,6 +363,7 @@ ceph_parse_options(char *options, const char *dev_name, >> opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; >> opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; >> opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; >> + opt->osd_request_timeout = CEPH_OSD_REQUEST_TIMEOUT_DEFAULT; >> >> /* get mon ip(s) */ >> /* ip1[:port1][,ip2[:port2]...] */ >> @@ -473,6 +476,15 @@ ceph_parse_options(char *options, const char *dev_name, >> } >> opt->mount_timeout = msecs_to_jiffies(intval * 1000); >> break; >> + case Opt_osd_request_timeout: >> + /* 0 is "wait forever" (i.e. infinite timeout) */ >> + if (intval < 0 || intval > INT_MAX / 1000) { >> + pr_err("osd_request_timeout out of range\n"); >> + err = -EINVAL; >> + goto out; >> + } >> + opt->osd_request_timeout = msecs_to_jiffies(intval * 1000); >> + break; >> >> case Opt_share: >> opt->flags &= ~CEPH_OPT_NOSHARE; >> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c >> index 842f049..2f5a024 100644 >> --- a/net/ceph/osd_client.c >> +++ b/net/ceph/osd_client.c >> @@ -2554,6 +2554,75 @@ static void handle_timeout(struct work_struct *work) >> osdc->client->options->osd_keepalive_timeout); >> } >> >> +/* >> + * Complete request to OSD with error err if it started before cutoff. >> + */ >> +static void complete_osd_stuck_request(struct ceph_osd_request *req, >> + unsigned long cutoff, >> + int err) >> +{ >> + if (!time_before(req->r_stamp, cutoff)) >> + return; >> + >> + pr_warn_ratelimited("osd req on osd%d timeout expired\n", >> + req->r_osd->o_osd); >> + >> + complete_request(req, err); >> +} >> + >> +/* >> + * Complete all requests to OSD that has been active for more than timeout. >> + */ >> +static void complete_osd_stuck_requests(struct ceph_osd *osd, >> + unsigned long timeout, >> + int err) >> +{ >> + struct ceph_osd_request *req, *n; >> + const unsigned long cutoff = jiffies - timeout; >> + >> + rbtree_postorder_for_each_entry_safe(req, n, &osd->o_requests, r_node) { >> + complete_osd_stuck_request(req, cutoff, -EIO); >> + } >> +} >> + >> +/* >> + * Timeout callback, called every N seconds. When 1 or more OSD >> + * requests that has been active for more than N seconds, >> + * we complete it with error EIO. >> + */ >> +static void handle_osd_request_timeout(struct work_struct *work) >> +{ >> + struct ceph_osd_client *osdc = >> + container_of(work, struct ceph_osd_client, >> + osd_request_timeout_work.work); >> + struct ceph_osd *osd, *n; >> + struct ceph_options *opts = osdc->client->options; >> + const unsigned long timeout = opts->osd_request_timeout; >> + >> + dout("%s osdc %p\n", __func__, osdc); >> + >> + if (!timeout) >> + return; >> + >> + down_write(&osdc->lock); >> + >> + rbtree_postorder_for_each_entry_safe(osd, n, &osdc->osds, o_node) { >> + complete_osd_stuck_requests(osd, timeout, -EIO); >> + } >> + >> + up_write(&osdc->lock); >> + >> + if (!atomic_read(&osdc->num_homeless)) >> + goto out; >> + >> + down_write(&osdc->lock); >> + complete_osd_stuck_requests(&osdc->homeless_osd, timeout, -EIO); >> + up_write(&osdc->lock); >> + >> +out: >> + schedule_delayed_work(&osdc->osd_request_timeout_work, timeout); >> +} >> + >> static void handle_osds_timeout(struct work_struct *work) >> { >> struct ceph_osd_client *osdc = >> @@ -4091,6 +4160,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct >> ceph_client *client) >> osdc->linger_map_checks = RB_ROOT; >> INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout); >> INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout); >> + INIT_DELAYED_WORK(&osdc->osd_request_timeout_work, handle_osd_request_timeout); >> >> err = -ENOMEM; >> osdc->osdmap = ceph_osdmap_alloc(); >> @@ -4120,6 +4190,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct >> ceph_client *client) >> osdc->client->options->osd_keepalive_timeout); >> schedule_delayed_work(&osdc->osds_timeout_work, >> round_jiffies_relative(osdc->client->options->osd_idle_ttl)); >> + schedule_delayed_work(&osdc->osd_request_timeout_work, >> + round_jiffies_relative(osdc->client->options->osd_request_timeout)); >> >> return 0; >> > > Having a job that has to wake up every second or so isn't ideal. Perhaps > you would be better off scheduling the delayed work in the request > submission codepath, and only rearm it when the tree isn't empty after > calling complete_osd_stuck_requests? > > Also, I don't see where this job is ever cancelled when the osdc is torn > down. That needs to occur or you'll cause a use-after-free oops... Yeah, there are other bugs here: rbtree_postorder_for_each_entry_safe() can't be used because complete_request() calls erase(), etc. The patch I had just piggy-backed on handle_timeout(), which is called every five seconds anyway. Given that there is interest, I think it's worth adding. Thanks, Ilya -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html