On Fri, Oct 30, 2020 at 09:00:48PM +0000, Martin Wilck wrote: > On Fri, 2020-10-23 at 16:15 -0500, Benjamin Marzinski wrote: > > There are times a fc rport is never lost, meaning that > > fast_io_fail_tmo > > and dev_loss_tmo never trigger, but scsi commands still hang. This > > can > > cause problems in cases where users have string timing requirements, > > Did you mean "strict" here? > Whoops. yes. > > and > > the easiest way to solve these issues is to set eh_deadline. Since > > it's > > already possible to set fast_io_fail_tmo and dev_loss_tmo from > > multipath.conf, and have multipath take care of setting it correctly > > for > > the scsi devices in sysfs, it makes sense to allow users to set > > eh_deadline here as well. > > > > Signed-off-by: Benjamin Marzinski <bmarzins@xxxxxxxxxx> > > --- > > libmultipath/config.c | 2 ++ > > libmultipath/config.h | 2 ++ > > libmultipath/configure.c | 1 + > > libmultipath/dict.c | 10 ++++++ > > libmultipath/discovery.c | 70 ++++++++++++++++++++++++++++++---- > > ---- > > libmultipath/propsel.c | 17 +++++++++ > > libmultipath/propsel.h | 1 + > > libmultipath/structs.h | 7 ++++ > > multipath/multipath.conf.5 | 16 +++++++++ > > 9 files changed, 111 insertions(+), 15 deletions(-) > > > > diff --git a/libmultipath/config.c b/libmultipath/config.c > > index 49e7fb81..9f3cb38d 100644 > > --- a/libmultipath/config.c > > +++ b/libmultipath/config.c > > @@ -424,6 +424,7 @@ merge_hwe (struct hwentry * dst, struct hwentry * > > src) > > merge_num(flush_on_last_del); > > merge_num(fast_io_fail); > > merge_num(dev_loss); > > + merge_num(eh_deadline); > > merge_num(user_friendly_names); > > merge_num(retain_hwhandler); > > merge_num(detect_prio); > > @@ -579,6 +580,7 @@ store_hwe (vector hwtable, struct hwentry * dhwe) > > hwe->flush_on_last_del = dhwe->flush_on_last_del; > > hwe->fast_io_fail = dhwe->fast_io_fail; > > hwe->dev_loss = dhwe->dev_loss; > > + hwe->eh_deadline = dhwe->eh_deadline; > > hwe->user_friendly_names = dhwe->user_friendly_names; > > hwe->retain_hwhandler = dhwe->retain_hwhandler; > > hwe->detect_prio = dhwe->detect_prio; > > diff --git a/libmultipath/config.h b/libmultipath/config.h > > index 661dd586..9ce37f16 100644 > > --- a/libmultipath/config.h > > +++ b/libmultipath/config.h > > @@ -63,6 +63,7 @@ struct hwentry { > > int flush_on_last_del; > > int fast_io_fail; > > unsigned int dev_loss; > > + int eh_deadline; > > int user_friendly_names; > > int retain_hwhandler; > > int detect_prio; > > @@ -148,6 +149,7 @@ struct config { > > int attribute_flags; > > int fast_io_fail; > > unsigned int dev_loss; > > + int eh_deadline; > > int log_checker_err; > > int allow_queueing; > > int allow_usb_devices; > > diff --git a/libmultipath/configure.c b/libmultipath/configure.c > > index 1c8aac08..a878d870 100644 > > --- a/libmultipath/configure.c > > +++ b/libmultipath/configure.c > > @@ -368,6 +368,7 @@ int setup_map(struct multipath *mpp, char > > *params, int params_size, > > select_gid(conf, mpp); > > select_fast_io_fail(conf, mpp); > > select_dev_loss(conf, mpp); > > + select_eh_deadline(conf, mpp); > > select_reservation_key(conf, mpp); > > select_deferred_remove(conf, mpp); > > select_marginal_path_err_sample_time(conf, mpp); > > diff --git a/libmultipath/dict.c b/libmultipath/dict.c > > index f4357da1..bab96146 100644 > > --- a/libmultipath/dict.c > > +++ b/libmultipath/dict.c > > @@ -899,6 +899,13 @@ declare_ovr_snprint(dev_loss, print_dev_loss) > > declare_hw_handler(dev_loss, set_dev_loss) > > declare_hw_snprint(dev_loss, print_dev_loss) > > > > +declare_def_handler(eh_deadline, set_undef_off_zero) > > +declare_def_snprint(eh_deadline, print_undef_off_zero) > > +declare_ovr_handler(eh_deadline, set_undef_off_zero) > > +declare_ovr_snprint(eh_deadline, print_undef_off_zero) > > +declare_hw_handler(eh_deadline, set_undef_off_zero) > > +declare_hw_snprint(eh_deadline, print_undef_off_zero) > > + > > static int > > set_pgpolicy(vector strvec, void *ptr) > > { > > @@ -1771,6 +1778,7 @@ init_keywords(vector keywords) > > install_keyword("gid", &def_gid_handler, &snprint_def_gid); > > install_keyword("fast_io_fail_tmo", &def_fast_io_fail_handler, > > &snprint_def_fast_io_fail); > > install_keyword("dev_loss_tmo", &def_dev_loss_handler, > > &snprint_def_dev_loss); > > + install_keyword("eh_deadline", &def_eh_deadline_handler, > > &snprint_def_eh_deadline); > > install_keyword("bindings_file", &def_bindings_file_handler, > > &snprint_def_bindings_file); > > install_keyword("wwids_file", &def_wwids_file_handler, > > &snprint_def_wwids_file); > > install_keyword("prkeys_file", &def_prkeys_file_handler, > > &snprint_def_prkeys_file); > > @@ -1880,6 +1888,7 @@ init_keywords(vector keywords) > > install_keyword("flush_on_last_del", > > &hw_flush_on_last_del_handler, &snprint_hw_flush_on_last_del); > > install_keyword("fast_io_fail_tmo", &hw_fast_io_fail_handler, > > &snprint_hw_fast_io_fail); > > install_keyword("dev_loss_tmo", &hw_dev_loss_handler, > > &snprint_hw_dev_loss); > > + install_keyword("eh_deadline", &hw_eh_deadline_handler, > > &snprint_hw_eh_deadline); > > install_keyword("user_friendly_names", > > &hw_user_friendly_names_handler, &snprint_hw_user_friendly_names); > > install_keyword("retain_attached_hw_handler", > > &hw_retain_hwhandler_handler, &snprint_hw_retain_hwhandler); > > install_keyword("detect_prio", &hw_detect_prio_handler, > > &snprint_hw_detect_prio); > > @@ -1920,6 +1929,7 @@ init_keywords(vector keywords) > > install_keyword("flush_on_last_del", > > &ovr_flush_on_last_del_handler, &snprint_ovr_flush_on_last_del); > > install_keyword("fast_io_fail_tmo", &ovr_fast_io_fail_handler, > > &snprint_ovr_fast_io_fail); > > install_keyword("dev_loss_tmo", &ovr_dev_loss_handler, > > &snprint_ovr_dev_loss); > > + install_keyword("eh_deadline", &ovr_eh_deadline_handler, > > &snprint_ovr_eh_deadline); > > install_keyword("user_friendly_names", > > &ovr_user_friendly_names_handler, &snprint_ovr_user_friendly_names); > > install_keyword("retain_attached_hw_handler", > > &ovr_retain_hwhandler_handler, &snprint_ovr_retain_hwhandler); > > install_keyword("detect_prio", &ovr_detect_prio_handler, > > &snprint_ovr_detect_prio); > > diff --git a/libmultipath/discovery.c b/libmultipath/discovery.c > > index 950b1586..ef9a9a23 100644 > > --- a/libmultipath/discovery.c > > +++ b/libmultipath/discovery.c > > @@ -585,6 +585,42 @@ sysfs_get_asymmetric_access_state(struct path > > *pp, char *buff, int buflen) > > return !!preferred; > > } > > > > +static int > > +sysfs_set_eh_deadline(struct multipath *mpp, struct path *pp) > > +{ > > + struct udev_device *hostdev; > > + char host_name[HOST_NAME_LEN], value[16]; > > + int ret; > > + > > + if (mpp->eh_deadline == EH_DEADLINE_UNSET) > > + return 0; > > + > > + sprintf(host_name, "host%d", pp->sg_id.host_no); > > + hostdev = udev_device_new_from_subsystem_sysname(udev, > > + "scsi_host", host_name); > > + if (!hostdev) > > + return 1; > > + > > + if (mpp->eh_deadline == EH_DEADLINE_OFF) > > + sprintf(value, "off"); > > + else if (mpp->eh_deadline == EH_DEADLINE_ZERO) > > + sprintf(value, "0"); > > + else > > + snprintf(value, 16, "%u", mpp->eh_deadline); > > Hm, mpp->eh_deadline is an "int". This should cause a compiler warning, > does it not? um.. no. Not on either gcc-8 or gcc-10. But I'll fix it. > Nitpick: You're certain to not need more than 11 bytes of string > buffer, so you could simply use sprintf(). If you want to use > snprintf(), please pass sizeof(value) rather than an explicit number. Sure. > > + > > + ret = sysfs_attr_set_value(hostdev, "eh_deadline", > > + value, strlen(value)); > > Nitpick: you could use the return value of sprintf() here rather than > calling strlen() again. Sure > > + /* > > + * not all scsi drivers support setting eh_deadline, so failing > > + * is totally reasonable > > + */ > > + if (ret <= 0) > > + condlog(4, "%s: failed to set eh_deadline to %s, error > > %d", udev_device_get_sysname(hostdev), value, -ret); > > Loglevel 3 might still be justified. After all, the user did set > eh_deadline, and might wonder why her settings aren't applied. makes sense. > > + > > + udev_device_unref(hostdev); > > + return (ret <= 0); > > +} > > + > > static void > > sysfs_set_rport_tmo(struct multipath *mpp, struct path *pp) > > { > > @@ -799,7 +835,8 @@ sysfs_set_scsi_tmo (struct multipath *mpp, > > unsigned int checkint) > > mpp->fast_io_fail = MP_FAST_IO_FAIL_OFF; > > } > > if (mpp->dev_loss == DEV_LOSS_TMO_UNSET && > > - mpp->fast_io_fail == MP_FAST_IO_FAIL_UNSET) > > + mpp->fast_io_fail == MP_FAST_IO_FAIL_UNSET && > > + mpp->eh_deadline == EH_DEADLINE_UNSET) > > return 0; > > > > vector_foreach_slot(mpp->paths, pp, i) { > > @@ -808,21 +845,24 @@ sysfs_set_scsi_tmo (struct multipath *mpp, > > unsigned int checkint) > > err_path = pp; > > continue; > > } > > - > > - switch (pp->sg_id.proto_id) { > > - case SCSI_PROTOCOL_FCP: > > - sysfs_set_rport_tmo(mpp, pp); > > - continue; > > - case SCSI_PROTOCOL_ISCSI: > > - sysfs_set_session_tmo(mpp, pp); > > - continue; > > - case SCSI_PROTOCOL_SAS: > > - sysfs_set_nexus_loss_tmo(mpp, pp); > > - continue; > > - default: > > - if (!err_path) > > - err_path = pp; > > + if (mpp->dev_loss != DEV_LOSS_TMO_UNSET || > > + mpp->fast_io_fail != MP_FAST_IO_FAIL_UNSET) { > > Perhaps we should rather check these in the called functions then > introduce another conditional here. Some callees do check these > already. We've got the likely shortcut (none set) above already. Sure. > > + switch (pp->sg_id.proto_id) { > > + case SCSI_PROTOCOL_FCP: > > + sysfs_set_rport_tmo(mpp, pp); > > + break; > > + case SCSI_PROTOCOL_ISCSI: > > + sysfs_set_session_tmo(mpp, pp); > > + break; > > + case SCSI_PROTOCOL_SAS: > > + sysfs_set_nexus_loss_tmo(mpp, pp); > > + break; > > + default: > > + if (!err_path) > > + err_path = pp; > > + } > > } > > + sysfs_set_eh_deadline(mpp, pp); > > } > > > > if (err_path) { > > diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c > > index 67d025cf..fa4ac5d9 100644 > > --- a/libmultipath/propsel.c > > +++ b/libmultipath/propsel.c > > @@ -775,6 +775,23 @@ out: > > return 0; > > } > > > > +int select_eh_deadline(struct config *conf, struct multipath *mp) > > +{ > > + const char *origin; > > + char buff[12]; > > + > > + mp_set_ovr(eh_deadline); > > + mp_set_hwe(eh_deadline); > > + mp_set_conf(eh_deadline); > > + mp->eh_deadline = EH_DEADLINE_UNSET; > > + /* not changing sysfs in default cause, so don't print anything > > */ > > + return 0; > > +out: > > + print_undef_off_zero(buff, 12, mp->eh_deadline); > > + condlog(3, "%s: eh_deadline = %s %s", mp->alias, buff, origin); > > + return 0; > > +} > > + > > int select_flush_on_last_del(struct config *conf, struct multipath > > *mp) > > { > > const char *origin; > > diff --git a/libmultipath/propsel.h b/libmultipath/propsel.h > > index 3d6edd8a..a68bacf0 100644 > > --- a/libmultipath/propsel.h > > +++ b/libmultipath/propsel.h > > @@ -17,6 +17,7 @@ int select_uid(struct config *conf, struct > > multipath *mp); > > int select_gid(struct config *conf, struct multipath *mp); > > int select_fast_io_fail(struct config *conf, struct multipath *mp); > > int select_dev_loss(struct config *conf, struct multipath *mp); > > +int select_eh_deadline(struct config *conf, struct multipath *mp); > > int select_reservation_key(struct config *conf, struct multipath > > *mp); > > int select_retain_hwhandler (struct config *conf, struct multipath * > > mp); > > int select_detect_prio(struct config *conf, struct path * pp); > > diff --git a/libmultipath/structs.h b/libmultipath/structs.h > > index cfa7b649..d6ff6762 100644 > > --- a/libmultipath/structs.h > > +++ b/libmultipath/structs.h > > @@ -236,6 +236,12 @@ enum fast_io_fail_states { > > MP_FAST_IO_FAIL_ZERO = UOZ_ZERO, > > }; > > > > +enum eh_deadline_states { > > + EH_DEADLINE_UNSET = UOZ_UNDEF, > > + EH_DEADLINE_OFF = UOZ_OFF, > > + EH_DEADLINE_ZERO = UOZ_ZERO, > > +}; > > + > > struct vpd_vendor_page { > > int pg; > > const char *name; > > @@ -356,6 +362,7 @@ struct multipath { > > int ghost_delay; > > int ghost_delay_tick; > > unsigned int dev_loss; > > + int eh_deadline; > > uid_t uid; > > gid_t gid; > > mode_t mode; > > diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5 > > index d2101ed6..cf05c1ab 100644 > > --- a/multipath/multipath.conf.5 > > +++ b/multipath/multipath.conf.5 > > @@ -717,6 +717,22 @@ The default is: \fB600\fR > > . > > . > > .TP > > +.B eh_deadline > > +Specify the maximum number of seconds the SCSI layer will spend > > doing error > > +handling when scsi devices fail. After this timeout the scsi layer > > will perform > > +a full HBA reset. Setting this may be necessary in cases where the > > rport is > > +never lost, so \fIfast_io_fail_tmo\fR and \fIdev_loss_tmo\fR will > > never > > +trigger, but (frequently do to load) scsi commands still hang. > > \fBNote:\fR when > > +the scsi error handler performs the HBA reset, all target paths on > > that HBA > > +will be affected. eh_deadline should only be set in cases where all > > targets on > > +the affected HBAs are multipathed. > > +.RS > > +.TP > > +The default is: \fB<unset>\fR > > +.RE > > +. > > +. > > +.TP > > .B bindings_file > > The full pathname of the binding file to be used when the > > user_friendly_names > > option is set. > > -- > Dr. Martin Wilck <mwilck@xxxxxxxx>, Tel. +49 (0)911 74053 2107 > SUSE Software Solutions Germany GmbH > HRB 36809, AG Nürnberg GF: Felix > Imendörffer > -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel