Internally we had discussions about adding a synchronize_rcu here: --- a/scsi/drivers/scsi/device_handler/scsi_dh_alua.c 2020-09-10 12:29:03.000000000 -0700 +++ b/scsi/drivers/scsi/device_handler/scsi_dh_alua.c 2020-09-18 14:59:59.000000000 -0700 @@ -658,8 +658,6 @@ rcu_read_lock(); list_for_each_entry_rcu(h, &tmp_pg->dh_list, node) { - /* h->sdev should always be valid */ - BUG_ON(!h->sdev); h->sdev->access_state = desc[0]; } rcu_read_unlock(); @@ -705,7 +703,6 @@ pg->expiry = 0; rcu_read_lock(); list_for_each_entry_rcu(h, &pg->dh_list, node) { - BUG_ON(!h->sdev); h->sdev->access_state = (pg->state & SCSI_ACCESS_STATE_MASK); if (pg->pref) @@ -1147,7 +1144,6 @@ spin_lock(&h->pg_lock); pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h->pg_lock)); rcu_assign_pointer(h->pg, NULL); - h->sdev = NULL; spin_unlock(&h->pg_lock); if (pg) { spin_lock_irq(&pg->lock); @@ -1156,6 +1152,7 @@ kref_put(&pg->kref, release_port_group); } sdev->handler_data = NULL; + synchronize_rcu(); kfree(h); } We were thinking that this would allow any running readers to finish before the object is freed. Would that make sense there? From what I can tell the only RCU implementation in this kernel version in scsi is here, so I couldn’t find many examples to follow. Brian Bunker SW Eng brian@xxxxxxxxxxxxxxx > On Sep 18, 2020, at 11:41 AM, Ewan D. Milne <emilne@xxxxxxxxxx> wrote: > > On Fri, 2020-09-11 at 09:21 -0700, Brian Bunker wrote: >> A race exists where the BUG_ON(!h->sdev) will fire if the detach >> device handler >> from one thread runs removing a list entry while another thread is >> trying to >> evaluate the target portal group state. >> >> Do not set the h->sdev to NULL in the detach device handler. It is >> freed at the >> end of the function any way. Also remove the BUG_ON since the >> condition >> that causes them to fire has been removed. >> >> Signed-off-by: Brian Bunker <brian@xxxxxxxxxxxxxxx> >> Acked-by: Krishna Kant <krishna.kant@xxxxxxxxxxxxxxx> >> ___ >> --- a/scsi/drivers/scsi/device_handler/scsi_dh_alua.c 2020-09-10 >> 12:29:03.000000000 -0700 >> +++ b/scsi/drivers/scsi/device_handler/scsi_dh_alua.c 2020-09-11 >> 09:14:15.000000000 -0700 >> @@ -658,8 +658,6 @@ >> rcu_read_lock(); >> list_for_each_entry_rcu(h, >> &tmp_pg->dh_list, >> node) { >> - /* h->sdev should >> always be valid */ >> - BUG_ON(!h->sdev); >> h->sdev->access_state >> = desc[0]; >> } >> rcu_read_unlock(); >> @@ -705,7 +703,6 @@ >> pg->expiry = 0; >> rcu_read_lock(); >> list_for_each_entry_rcu(h, &pg->dh_list, >> node) { >> - BUG_ON(!h->sdev); >> h->sdev->access_state = >> (pg->state & >> SCSI_ACCESS_STATE_MASK); >> if (pg->pref) >> @@ -1147,7 +1144,6 @@ >> spin_lock(&h->pg_lock); >> pg = rcu_dereference_protected(h->pg, lockdep_is_held(&h- >>> pg_lock)); >> rcu_assign_pointer(h->pg, NULL); >> - h->sdev = NULL; >> spin_unlock(&h->pg_lock); >> if (pg) { >> spin_lock_irq(&pg->lock); >> > > We ran this change through fault insertion regression testing and > did not see any new problems. (Our tests didn't hit the original > bug being fixed here though.) > > -Ewan > >