> On 16 Oct 2018, at 12:58, Håkon Bugge <haakon.bugge@xxxxxxxxxx> wrote: > > On 12 Oct 2018, at 19:13, Håkon Bugge <haakon.bugge@xxxxxxxxxx> wrote: >> >> In the default provider, acmp, the port's sa_dest.refcnt is >> initialized to one in acmp_init_dest(). When a port associated with >> the device is added, the refcnt is set once again to one in >> acmp_port_up(). >> >> For a system with VFs enabled, the ports for both the PF and the VFs >> are added. Here an extract from the log file on a system with a single >> VF enabled: >> >> acmp_port_up: mlx4_0 1 >> acmp_port_up: mlx4_0 1 is up >> acmp_port_up: mlx4_0 1 >> acmp_port_up: mlx4_0 1 is up >> >> Now, when/if this ports goes down, acmp_port_down() will be called. It >> decrements the refcnt and waits endlessly until it becomes zero. For >> the first call to acmp_port_down(), it becomes zero, but the >> subsequent call will loop forever in said busy-waiting loop, due to >> the refcnt becoming minus one. >> >> Fixed by changing the atomic_set() to atomic_inc() in acmp_port_up() >> and test for one instead of the while loop in acmp_port_down(). >> >> Also added printing of instance number to ease debugging. >> >> With the fix, the extract from the log file yields: >> >> acmp_port_up: mlx4_0 1 >> acmp_port_up: mlx4_0 1 1 is up >> acmp_port_up: mlx4_0 1 >> acmp_port_up: mlx4_0 1 2 is up >> acmp_port_down: mlx4_0 1 >> acmp_port_down: mlx4_0 1 2 is down >> acmp_port_down: mlx4_0 1 >> acmp_port_down: mlx4_0 1 1 is down >> >> Signed-off-by: Håkon Bugge <haakon.bugge@xxxxxxxxxx> >> --- >> ibacm/prov/acmp/src/acmp.c | 21 ++++++++++++--------- >> 1 file changed, 12 insertions(+), 9 deletions(-) >> >> diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c >> index 0324ae2f..76c52fcb 100644 >> --- a/ibacm/prov/acmp/src/acmp.c >> +++ b/ibacm/prov/acmp/src/acmp.c >> @@ -2618,6 +2618,7 @@ static void acmp_port_up(struct acmp_port *port) >> __be16 pkey_be; >> __be16 sm_lid; >> int i, ret; >> + int instance; >> >> acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num); >> ret = ibv_query_port(port->dev->verbs, port->port_num, &attr); >> @@ -2643,7 +2644,7 @@ static void acmp_port_up(struct acmp_port *port) >> acmp_set_dest_addr(&port->sa_dest, ACM_ADDRESS_LID, >> (uint8_t *) &sm_lid, sizeof(sm_lid)); >> >> - atomic_set(&port->sa_dest.refcnt, 1); >> + instance = atomic_inc(&port->sa_dest.refcnt) - 1; >> port->sa_dest.state = ACMP_READY; >> for (i = 0; i < attr.pkey_tbl_len; i++) { >> ret = ibv_query_pkey(port->dev->verbs, port->port_num, i, &pkey_be); >> @@ -2663,11 +2664,13 @@ static void acmp_port_up(struct acmp_port *port) >> } >> >> port->state = IBV_PORT_ACTIVE; >> - acm_log(1, "%s %d is up\n", port->dev->verbs->device->name, port->port_num); >> + acm_log(1, "%s %d %d is up\n", port->dev->verbs->device->name, port->port_num, instance); >> } >> >> static void acmp_port_down(struct acmp_port *port) >> { >> + int instance; >> + >> acm_log(1, "%s %d\n", port->dev->verbs->device->name, port->port_num); >> pthread_mutex_lock(&port->lock); >> port->state = IBV_PORT_DOWN; >> @@ -2678,13 +2681,13 @@ static void acmp_port_down(struct acmp_port *port) >> * event instead of a sleep loop, but it's not worth it given how >> * infrequently we should be processing a port down event in practice. >> */ >> - atomic_dec(&port->sa_dest.refcnt); >> - while (atomic_get(&port->sa_dest.refcnt)) >> - sleep(0); >> - pthread_mutex_lock(&port->sa_dest.lock); >> - port->sa_dest.state = ACMP_INIT; >> - pthread_mutex_unlock(&port->sa_dest.lock); >> - acm_log(1, "%s %d is down\n", port->dev->verbs->device->name, port->port_num); >> + instance = atomic_dec(&port->sa_dest.refcnt); >> + if (instance == 1) { >> + pthread_mutex_lock(&port->sa_dest.lock); >> + port->sa_dest.state = ACMP_INIT; >> + pthread_mutex_unlock(&port->sa_dest.lock); >> + } >> + acm_log(1, "%s %d %d is down\n", port->dev->verbs->device->name, port->port_num, instance); >> } >> >> static int acmp_open_port(const struct acm_port *cport, void *dev_context, >> -- >> 2.14.3 >> > > Just made a github pull request out of this. Anything more required from my side on: https://github.com/linux-rdma/rdma-core/pull/400 ? Thxs, Håkon