From: Cyril Bordage <cbordage@xxxxxxxxxxxxx>

Add a tunable parameter to limit the recovery ping interval, whose
maximum was previously hard-coded to 900 seconds. This can be done by
using:

  lnetctl set max_recovery_ping_interval <value>

The value must be in the range [1, INT_MAX]; anything else is rejected
with -EINVAL.

WC-bug-id: https://jira.whamcloud.com/browse/LU-14979
Lustre-commit: 4027395fe463b6ea1 ("LU-14979 lnet: set max recovery interval duration")
Signed-off-by: Cyril Bordage <cbordage@xxxxxxxxxxxxx>
Signed-off-by: Chris Horn <chris.horn@xxxxxxx>
Reviewed-on: https://review.whamcloud.com/44927
Reviewed-by: Serguei Smirnov <ssmirnov@xxxxxxxxxxxxx>
Reviewed-by: Frank Sehr <fsehr@xxxxxxxxxxxxx>
Reviewed-by: Oleg Drokin <green@xxxxxxxxxxxxx>
Signed-off-by: James Simmons <jsimmons@xxxxxxxxxxxxx>
---
 include/linux/lnet/lib-lnet.h |  9 +++---
 net/lnet/lnet/api-ni.c        | 76 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/include/linux/lnet/lib-lnet.h b/include/linux/lnet/lib-lnet.h
index ceb12b1..e21866b 100644
--- a/include/linux/lnet/lib-lnet.h
+++ b/include/linux/lnet/lib-lnet.h
@@ -559,6 +559,8 @@ unsigned int lnet_nid_cpt_hash(struct lnet_nid *nid,
 extern unsigned int lnet_recovery_limit;
 extern unsigned int lnet_peer_discovery_disabled;
 extern unsigned int lnet_drop_asym_route;
+extern unsigned int lnet_max_recovery_ping_interval;
+extern unsigned int lnet_max_recovery_ping_count;
 extern unsigned int router_sensitivity_percentage;
 extern int alive_router_check_interval;
 extern int live_router_check_interval;
@@ -1009,15 +1011,14 @@ int lnet_get_peer_ni_info(u32 peer_index, u64 *nid,
 	return false;
 }
 
-#define LNET_RECOVERY_INTERVAL_MAX 900
 static inline unsigned int
 lnet_get_next_recovery_ping(unsigned int ping_count, time64_t now)
 {
 	unsigned int interval;
 
-	/* 2^9 = 512, 2^10 = 1024 */
-	if (ping_count > 9)
-		interval = LNET_RECOVERY_INTERVAL_MAX;
+	/* 2^lnet_max_recovery_ping_count <= lnet_max_recovery_ping_interval */
+	if (ping_count > lnet_max_recovery_ping_count)
+		interval = lnet_max_recovery_ping_interval;
 	else
 		interval = 1 << ping_count;
 
diff --git a/net/lnet/lnet/api-ni.c b/net/lnet/lnet/api-ni.c
index 8643ac8d..165728d 100644
--- a/net/lnet/lnet/api-ni.c
+++ b/net/lnet/lnet/api-ni.c
@@ -117,6 +117,26 @@ static int recovery_interval_set(const char *val,
 MODULE_PARM_DESC(lnet_recovery_limit,
 		 "How long to attempt recovery of unhealthy peer interfaces in seconds. Set to 0 to allow indefinite recovery");
 
+/* Upper bound, in seconds, on the interval between recovery pings */
+unsigned int lnet_max_recovery_ping_interval = 900;
+/* floor(log2(lnet_max_recovery_ping_interval)); updated together with
+ * the interval by max_recovery_ping_interval_set()
+ */
+unsigned int lnet_max_recovery_ping_count = 9;
+static int max_recovery_ping_interval_set(const char *val,
+					  const struct kernel_param *kp);
+
+#define param_check_max_recovery_ping_interval(name, p) \
+	__param_check(name, p, unsigned int)
+
+static const struct kernel_param_ops param_ops_max_recovery_ping_interval = {
+	.set = max_recovery_ping_interval_set,
+	.get = param_get_uint,
+};
+module_param(lnet_max_recovery_ping_interval, max_recovery_ping_interval, 0644);
+MODULE_PARM_DESC(lnet_max_recovery_ping_interval,
+		 "The max interval between LNet recovery pings, in seconds");
+
 static int lnet_interfaces_max = LNET_INTERFACES_MAX_DEFAULT;
 static int intf_max_set(const char *val, const struct kernel_param *kp);
 module_param_call(lnet_interfaces_max, intf_max_set, param_get_int,
@@ -258,6 +278,62 @@ static int lnet_discover(struct lnet_process_id id, u32 force,
 }
 
+/* Module parameter setter for lnet_max_recovery_ping_interval.
+ *
+ * Parses @val as a number of seconds and stores it as the maximum
+ * recovery ping interval, together with lnet_max_recovery_ping_count =
+ * floor(log2(value)), i.e. the largest exponent for which 2^count does
+ * not exceed the new maximum.
+ *
+ * Returns 0 on success, the kstrtoul() error code if @val cannot be
+ * parsed, or -EINVAL if the value is zero or greater than INT_MAX.
+ */
+static int
+max_recovery_ping_interval_set(const char *val, const struct kernel_param *kp)
+{
+	unsigned int ping_count = 0;
+	unsigned long value;
+	unsigned long rest;
+	int rc;
+
+	rc = kstrtoul(val, 0, &value);
+	if (rc) {
+		CERROR("Invalid module parameter value for 'lnet_max_recovery_ping_interval'\n");
+		return rc;
+	}
+
+	if (!value) {
+		CERROR("Invalid max ping timeout. Must be strictly positive\n");
+		return -EINVAL;
+	}
+
+	/* The interval is stored in an unsigned int and
+	 * lnet_get_next_recovery_ping() computes 1 << ping_count in int
+	 * arithmetic, so reject values that would be truncated on
+	 * assignment or push the derived shift count above 30.
+	 */
+	if (value > INT_MAX) {
+		CERROR("Invalid max ping timeout. Must not exceed %d\n",
+		       INT_MAX);
+		return -EINVAL;
+	}
+
+	/* ping_count = floor(log2(value)) */
+	for (rest = value >> 1; rest; rest >>= 1)
+		ping_count++;
+
+	/* Compute the count before taking the lock and publish both
+	 * values together under the api_mutex, so that concurrent
+	 * setters cannot interleave their updates.
+	 */
+	mutex_lock(&the_lnet.ln_api_mutex);
+	lnet_max_recovery_ping_interval = value;
+	lnet_max_recovery_ping_count = ping_count;
+	mutex_unlock(&the_lnet.ln_api_mutex);
+
+	return 0;
+}
+
 static int
 discovery_set(const char *val, const struct kernel_param *kp)
 {
 	int rc;
-- 
1.8.3.1