Hi Ben, thanks a lot for this. I have only a few minor nitpicks (see below). I suppose you've tested this already? Regards Martin On Fri, 2018-02-09 at 23:07 -0600, Benjamin Marzinski wrote: > The current method of waiting for dmevents on multipath devices > involves > creating a seperate thread for each device. This can become very > wasteful when there are large numbers of multipath devices. Also, > since > multipathd needs to grab the vecs lock to update the devices, the > additional threads don't actually provide much parallelism. > > The patch adds a new method of updating multipath devices on > dmevents, > which uses the new device-mapper event polling interface. This means > that there is only one dmevent waiting thread which will wait for > events > on all of the multipath devices. Currently the code to get the event > number from the list of device names and to re-arm the polling > interface > is not in libdevmapper, so the patch does that work. Obviously, these > bits need to go into libdevmapper, so that multipathd can use a > standard > interface. > > I haven't touched any of the existing event waiting code, since event > polling was only added to device-mapper in version > 4.37.0. multipathd > checks this version, and defaults to using the polling code if > device-mapper supports it. This can be overridden by running > multipathd > with "-w", to force it to use the old event waiting code. Why use a command line option here rather than a config file option? > > Signed-off-by: Benjamin Marzinski <bmarzins@xxxxxxxxxx> > --- > multipathd/Makefile | 3 +- > multipathd/dmevents.c | 396 > ++++++++++++++++++++++++++++++++++++++++++++++++++ > multipathd/dmevents.h | 13 ++ > multipathd/main.c | 58 +++++++- > 4 files changed, 461 insertions(+), 9 deletions(-) > create mode 100644 multipathd/dmevents.c > create mode 100644 multipathd/dmevents.h > > diff --git a/multipathd/Makefile b/multipathd/Makefile > index 85f29a7..4c438f0 100644 > --- a/multipathd/Makefile > +++ b/multipathd/Makefile > @@ -22,7 +22,8 @@ ifdef SYSTEMD > endif > endif > > -OBJS = main.o pidfile.o uxlsnr.o uxclnt.o cli.o cli_handlers.o > waiter.o > +OBJS = main.o pidfile.o uxlsnr.o uxclnt.o cli.o cli_handlers.o > waiter.o \ > + dmevents.o > > EXEC = multipathd > > diff --git a/multipathd/dmevents.c b/multipathd/dmevents.c > new file mode 100644 > index 0000000..a56c055 > --- /dev/null > +++ b/multipathd/dmevents.c > @@ -0,0 +1,396 @@ > +/* > + * Copyright (c) 2004, 2005 Christophe Varoqui > + * Copyright (c) 2005 Kiyoshi Ueda, NEC > + * Copyright (c) 2005 Edward Goggin, EMC > + * Copyright (c) 2005, 2018 Benjamin Marzinski, Redhat > + */ > +#include <unistd.h> > +#include <libdevmapper.h> > +#include <sys/mman.h> > +#include <pthread.h> > +#include <urcu.h> > +#include <poll.h> > +#include <sys/ioctl.h> > +#include <sys/types.h> > +#include <sys/stat.h> > +#include <fcntl.h> > +#include <linux/dm-ioctl.h> > +#include <errno.h> > + > +#include "vector.h" > +#include "structs.h" > +#include "structs_vec.h" > +#include "devmapper.h" > +#include "debug.h" > +#include "dmevents.h" > + > +#ifndef DM_DEV_ARM_POLL > +#define DM_DEV_ARM_POLL _IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD + 1, > struct dm_ioctl) > +#endif > + > +enum event_actions { > + EVENT_NOTHING, > + EVENT_REMOVE, > + EVENT_UPDATE, > +}; > + > +struct dev_event { > + char name[WWID_SIZE]; > + uint32_t evt_nr; > + enum event_actions action; > +}; > + > +struct dmevent_waiter { > + int fd; > + struct vectors *vecs; > + vector events; > + pthread_mutex_t events_lock; > +}; > + > +static struct dmevent_waiter *waiter; > + > +int dmevent_poll_supported(void) > +{ > + unsigned int minv[3] = {4, 37, 0}; > + unsigned int v[3]; > + > + if (dm_drv_version(v)) > + return 0; > + > + if (VERSION_GE(v, minv)) > + return 1; > + return 0; > +} > + > + > +int alloc_dmevent_waiter(struct vectors *vecs) > +{ > + if (!vecs) { > + condlog(0, "can't create waiter structure. invalid > vectors"); > + goto fail; > + } > + waiter = (struct dmevent_waiter *)malloc(sizeof(struct > dmevent_waiter)); > + if (!waiter) { > + condlog(0, "failed to allocate waiter structure"); > + goto fail; > + } > + memset(waiter, 0, sizeof(struct dmevent_waiter)); > + waiter->events = vector_alloc(); > + if (!waiter->events) { > + condlog(0, "failed to allocate waiter events > vector"); > + goto fail_waiter; > + } > + waiter->fd = open("/dev/mapper/control", O_RDWR); > + if (waiter->fd < 0) { > + condlog(0, "failed to open /dev/mapper/control for > waiter"); > + goto fail_events; > + } > + pthread_mutex_init(&waiter->events_lock, NULL); > + waiter->vecs = vecs; > + > + return 0; > +fail_events: > + vector_free(waiter->events); > +fail_waiter: > + free(waiter); > +fail: > + waiter = NULL; > + return -1; > +} Nitpick: conventionally, an "alloc"-type function would return the pointer, and NULL on failure. > + > +void free_dmevent_waiter(void) > +{ > + struct dev_event *dev_evt; > + int i; > + > + if (!waiter) > + return; > + pthread_mutex_destroy(&waiter->events_lock); > + close(waiter->fd); > + vector_foreach_slot(waiter->events, dev_evt, i) > + free(dev_evt); > + vector_free(waiter->events); > + free(waiter); > + waiter = NULL; > +} Nitpick: Similarly, a "free" function typically takes the pointer to be freed as argument. > + > +static int arm_dm_event_poll(int fd) > +{ > + struct dm_ioctl dmi; > + memset(&dmi, 0, sizeof(dmi)); > + dmi.version[0] = DM_VERSION_MAJOR; > + dmi.version[1] = DM_VERSION_MINOR; > + dmi.version[2] = DM_VERSION_PATCHLEVEL; > + dmi.flags = 0x4; What's the meaning of this flag? I couldn't find it in dm-ioctl.h > + dmi.data_start = offsetof(struct dm_ioctl, data); > + dmi.data_size = sizeof(dmi); > + return ioctl(fd, DM_DEV_ARM_POLL, &dmi); > +} > + > +/* > + * As of version 4.37.0 device-mapper stores the event number in the > + * dm_names structure after the name, when DM_DEVICE_LIST is called > + */ > +static uint32_t dm_event_nr(struct dm_names *n) > +{ > + return *(uint32_t *)(((uintptr_t)(strchr(n->name, 0) + 1) + > 7) & ~7); > +} > + > +static int dm_get_events(void) > +{ > + struct dm_task *dmt; > + struct dm_names *names; > + struct dev_event *dev_evt; > + int i; > + > + if (!(dmt = libmp_dm_task_create(DM_DEVICE_LIST))) > + return -1; > + > + dm_task_no_open_count(dmt); > + > + if (!dm_task_run(dmt)) > + goto fail; > + > + if (!(names = dm_task_get_names(dmt))) > + goto fail; > + > + pthread_mutex_lock(&waiter->events_lock); > + vector_foreach_slot(waiter->events, dev_evt, i) > + dev_evt->action = EVENT_REMOVE; > + while (names->dev) { > + uint32_t event_nr; > + > + if (!dm_is_mpath(names->name)) > + goto next; > + > + event_nr = dm_event_nr(names); > + vector_foreach_slot(waiter->events, dev_evt, i) { > + if (!strcmp(dev_evt->name, names->name)) { > + if (event_nr != dev_evt->evt_nr) { > + dev_evt->evt_nr = event_nr; > + dev_evt->action = > EVENT_UPDATE; > + } else > + dev_evt->action = > EVENT_NOTHING; > + break; > + } > + } > +next: > + if (!names->next) > + break; > + names = (void *)names + names->next; > + } > + pthread_mutex_unlock(&waiter->events_lock); > + dm_task_destroy(dmt); > + return 0; > + > +fail: > + dm_task_destroy(dmt); > + return -1; > +} > + > +/* You must call update_multipath() after calling this function, to > + * deal with any events that came in before the device was added */ > +int watch_dmevents(char *name) > +{ > + int event_nr; > + struct dev_event *dev_evt, *old_dev_evt; > + int i; > + > + if (!dm_is_mpath(name)) { > + condlog(0, "%s: not a multipath device. can't watch > events", > + name); > + return -1; > + } > + > + if ((event_nr = dm_geteventnr(name)) < 0) > + return -1; > + > + dev_evt = (struct dev_event *)malloc(sizeof(struct > dev_event)); > + if (!dev_evt) { > + condlog(0, "%s: can't allocate event waiter > structure", name); > + return -1; > + } > + > + strncpy(dev_evt->name, name, WWID_SIZE); > + dev_evt->name[WWID_SIZE - 1] = 0; Nitpick: It might be better to use strlcpy or snprintf here. > + dev_evt->evt_nr = event_nr; > + dev_evt->action = EVENT_NOTHING; > + > + pthread_mutex_lock(&waiter->events_lock); > + vector_foreach_slot(waiter->events, old_dev_evt, i){ > + if (!strcmp(dev_evt->name, old_dev_evt->name)) { > + /* caller will be updating this device */ > + old_dev_evt->evt_nr = event_nr; > + old_dev_evt->action = EVENT_NOTHING; > + pthread_mutex_unlock(&waiter->events_lock); > + condlog(2, "%s: already waiting for events > on device", > + name); > + free(dev_evt); > + return 0; > + } > + } > + if (!vector_alloc_slot(waiter->events)) { > + pthread_mutex_unlock(&waiter->events_lock); > + free(dev_evt); > + return -1; > + } > + vector_set_slot(waiter->events, dev_evt); > + pthread_mutex_unlock(&waiter->events_lock); > + return 0; > +} > + > +void unwatch_all_dmevents(void) > +{ > + struct dev_event *dev_evt; > + int i; > + > + pthread_mutex_lock(&waiter->events_lock); > + vector_foreach_slot(waiter->events, dev_evt, i) > + free(dev_evt); > + vector_reset(waiter->events); > + pthread_mutex_unlock(&waiter->events_lock); > +} > + > +static void unwatch_dmevents(char *name) > +{ > + struct dev_event *dev_evt; > + int i; > + > + pthread_mutex_lock(&waiter->events_lock); > + vector_foreach_slot(waiter->events, dev_evt, i) { > + if (!strcmp(dev_evt->name, name)) { > + vector_del_slot(waiter->events, i); > + free(dev_evt); > + break; > + } > + } > + pthread_mutex_unlock(&waiter->events_lock); > +} > + > +/* > + * returns the reschedule delay > + * negative means *stop* > + */ > + > +/* poll, arm, update, return */ > +static int dmevent_loop (void) > +{ > + int r, i = 0; > + struct pollfd pfd; > + struct dev_event *dev_evt; > + > + pfd.fd = waiter->fd; > + pfd.events = POLLIN; > + r = poll(&pfd, 1, -1); > + if (r <= 0) { > + condlog(0, "failed polling for dm events: %s", > strerror(errno)); > + /* sleep 1s and hope things get better */ > + return 1; > + } > + > + if (arm_dm_event_poll(waiter->fd) != 0) { > + condlog(0, "Cannot re-arm event polling: %s", > strerror(errno)); > + /* sleep 1s and hope things get better */ > + return 1; > + } > + > + if (dm_get_events() != 0) { > + condlog(0, "failed getting dm events: %s", > strerror(errno)); > + /* sleep 1s and hope things get better */ > + return 1; > + } > + > + /* > + * upon event ... > + */ > + > + while (1) { > + int done = 1; > + struct dev_event curr_dev; > + struct multipath *mpp; > + > + pthread_mutex_lock(&waiter->events_lock); > + vector_foreach_slot(waiter->events, dev_evt, i) { > + if (dev_evt->action != EVENT_NOTHING) { > + curr_dev = *dev_evt; > + if (dev_evt->action == EVENT_REMOVE) > { > + vector_del_slot(waiter- > >events, i); > + free(dev_evt); > + } else > + dev_evt->action = > EVENT_NOTHING; > + done = 0; > + break; > + } > + } > + pthread_mutex_unlock(&waiter->events_lock); > + if (done) > + return 1; > + > + condlog(3, "%s: devmap event #%i", curr_dev.name, > + curr_dev.evt_nr); > + > + /* > + * event might be : > + * > + * 1) a table reload, which means our mpp structure > is > + * obsolete : refresh it through > update_multipath() > + * 2) a path failed by DM : mark as such through > + * update_multipath() > + * 3) map has gone away : stop the thread. > + * 4) a path reinstate : nothing to do > + * 5) a switch group : nothing to do > + */ > + pthread_cleanup_push(cleanup_lock, &waiter->vecs- > >lock); > + lock(&waiter->vecs->lock); > + pthread_testcancel(); > + r = 0; > + if (curr_dev.action == EVENT_REMOVE) { > + mpp = find_mp_by_alias(waiter->vecs->mpvec, > + curr_dev.name); > + if (mpp) > + remove_map(mpp, waiter->vecs, 1); > + } else > + r = update_multipath(waiter->vecs, > curr_dev.name, 1); > + lock_cleanup_pop(&waiter->vecs->lock); > + > + if (r) { > + condlog(2, "%s: stopped watching dmevents", > + curr_dev.name); > + unwatch_dmevents(curr_dev.name); > + } > + } > + condlog(0, "dmevent waiter thread unexpectedly quit"); > + return -1; /* never reach there */ > +} > + > +static void rcu_unregister(void *param) > +{ > + rcu_unregister_thread(); > +} > + > +void *wait_dmevents (void *unused) > +{ > + int r; > + > + > + if (!waiter) { > + condlog(0, "dmevents waiter not intialized"); > + return NULL; > + } > + > + pthread_cleanup_push(rcu_unregister, NULL); > + rcu_register_thread(); > + mlockall(MCL_CURRENT | MCL_FUTURE); > + > + while (1) { > + r = dmevent_loop(); > + > + if (r < 0) > + break; > + > + sleep(r); > + } > + > + pthread_cleanup_pop(1); > + return NULL; > +} > diff --git a/multipathd/dmevents.h b/multipathd/dmevents.h > new file mode 100644 > index 0000000..569e855 > --- /dev/null > +++ b/multipathd/dmevents.h > @@ -0,0 +1,13 @@ > +#ifndef _DMEVENTS_H > +#define _DMEVENTS_H > + > +#include "structs_vec.h" > + > +int dmevent_poll_supported(void); > +int alloc_dmevent_waiter(struct vectors *vecs); > +void free_dmevent_waiter(void); > +int watch_dmevents(char *name); > +void unwatch_all_dmevents(void); > +void *wait_dmevents (void *unused); > + > +#endif /* _DMEVENTS_H */ > diff --git a/multipathd/main.c b/multipathd/main.c > index 2963bde..6dabf2c 100644 > --- a/multipathd/main.c > +++ b/multipathd/main.c > @@ -82,6 +82,7 @@ static int use_watchdog; > #include "cli_handlers.h" > #include "lock.h" > #include "waiter.h" > +#include "dmevents.h" > #include "io_err_stat.h" > #include "wwids.h" > #include "../third-party/valgrind/drd.h" > @@ -108,6 +109,7 @@ int uxsock_timeout; > int verbosity; > int bindings_read_only; > int ignore_new_devs; > +int poll_dmevents = 1; > enum daemon_status running_state = DAEMON_INIT; > pid_t daemon_pid; > pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER; > @@ -288,11 +290,23 @@ switch_pathgroup (struct multipath * mpp) > mpp->alias, mpp->bestpg); > } > > +static int > +wait_for_events(struct multipath *mpp, struct vectors *vecs) > +{ > + if (poll_dmevents) > + return watch_dmevents(mpp->alias); > + else > + return start_waiter_thread(mpp, vecs); > +} > + > static void > remove_map_and_stop_waiter(struct multipath *mpp, struct vectors > *vecs, > int purge_vec) > { > - stop_waiter_thread(mpp, vecs); > + /* devices are automatically removed by the dmevent polling > code, > + * so they don't need to be manually removed here */ > + if (!poll_dmevents) > + stop_waiter_thread(mpp, vecs); > remove_map(mpp, vecs, purge_vec); > } > > @@ -305,8 +319,12 @@ remove_maps_and_stop_waiters(struct vectors > *vecs) > if (!vecs) > return; > > - vector_foreach_slot(vecs->mpvec, mpp, i) > - stop_waiter_thread(mpp, vecs); > + if (!poll_dmevents) { > + vector_foreach_slot(vecs->mpvec, mpp, i) > + stop_waiter_thread(mpp, vecs); > + } > + else > + unwatch_all_dmevents(); > > remove_maps(vecs); > } > @@ -351,7 +369,7 @@ retry: > dm_lib_release(); > > fail: > - if (new_map && (retries < 0 || start_waiter_thread(mpp, > vecs))) { > + if (new_map && (retries < 0 || wait_for_events(mpp, vecs))) > { > condlog(0, "%s: failed to create new map", mpp- > >alias); > remove_map(mpp, vecs, 1); > return 1; > @@ -870,7 +888,7 @@ retry: > > if ((mpp->action == ACT_CREATE || > (mpp->action == ACT_NOTHING && start_waiter && !mpp- > >waiter)) && > - start_waiter_thread(mpp, vecs)) > + wait_for_events(mpp, vecs)) > goto fail_map; > > /* > @@ -2173,7 +2191,7 @@ configure (struct vectors * vecs) > * start dm event waiter threads for these new maps > */ > vector_foreach_slot(vecs->mpvec, mpp, i) { > - if (start_waiter_thread(mpp, vecs)) { > + if (wait_for_events(mpp, vecs)) { > remove_map(mpp, vecs, 1); > i--; > continue; > @@ -2414,7 +2432,7 @@ set_oom_adj (void) > static int > child (void * param) > { > - pthread_t check_thr, uevent_thr, uxlsnr_thr, uevq_thr; > + pthread_t check_thr, uevent_thr, uxlsnr_thr, uevq_thr, > dmevent_thr; > pthread_attr_t log_attr, misc_attr, uevent_attr; > struct vectors * vecs; > struct multipath * mpp; > @@ -2476,6 +2494,8 @@ child (void * param) > goto failed; > } > > + if (poll_dmevents) > + poll_dmevents = dmevent_poll_supported(); > setlogmask(LOG_UPTO(conf->verbosity + 3)); > > envp = getenv("LimitNOFILE"); > @@ -2542,6 +2562,19 @@ child (void * param) > > init_path_check_interval(vecs); > > + if (poll_dmevents) { > + if (alloc_dmevent_waiter(vecs)) { > + condlog(0, "failed to allocate dmevents > waiter info"); > + goto failed; > + } > + if ((rc = pthread_create(&dmevent_thr, &misc_attr, > + wait_dmevents, NULL))) { > + condlog(0, "failed to create dmevent waiter > thread: %d", > + rc); > + goto failed; > + } > + } > + > /* > * Start uevent listener early to catch events > */ > @@ -2615,11 +2648,15 @@ child (void * param) > pthread_cancel(uevent_thr); > pthread_cancel(uxlsnr_thr); > pthread_cancel(uevq_thr); > + if (poll_dmevents) > + pthread_cancel(dmevent_thr); > > pthread_join(check_thr, NULL); > pthread_join(uevent_thr, NULL); > pthread_join(uxlsnr_thr, NULL); > pthread_join(uevq_thr, NULL); > + if (poll_dmevents) > + pthread_join(dmevent_thr, NULL); > > stop_io_err_stat_thread(); > > @@ -2634,6 +2671,8 @@ child (void * param) > > cleanup_checkers(); > cleanup_prio(); > + if (poll_dmevents) > + free_dmevent_waiter(); > > dm_lib_release(); > dm_lib_exit(); > @@ -2765,7 +2804,7 @@ main (int argc, char *argv[]) > udev = udev_new(); > libmp_udev_set_sync_support(0); > > - while ((arg = getopt(argc, argv, ":dsv:k::Bn")) != EOF ) { > + while ((arg = getopt(argc, argv, ":dsv:k::Bniw")) != EOF ) { > switch(arg) { > case 'd': > foreground = 1; > @@ -2799,6 +2838,9 @@ main (int argc, char *argv[]) > case 'n': > ignore_new_devs = 1; > break; > + case 'w': > + poll_dmevents = 0; > + break; > default: > fprintf(stderr, "Invalid argument '-%c'\n", > optopt); -- Dr. Martin Wilck <mwilck@xxxxxxxx>, Tel. +49 (0)911 74053 2107 SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton HRB 21284 (AG Nürnberg) -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel