On Wed, 2022-05-25 at 14:07 +0200, Hannes Reinecke wrote: > On 5/25/22 13:20, Martin Wilck wrote: > > On Wed, 2022-05-25 at 10:11 +0200, Hannes Reinecke wrote: > > > When ALUA transitioning timeout triggers the path group state > > > must > > > be considered invalid. So add a new flag ALUA_PG_FAILED to > > > indicate > > > that the path state isn't necessarily valid, and keep the > > > existing > > > path state until we get a valid response from a RTPG. > > > > > > Cc: Brian Bunker <brian@xxxxxxxxxxxxxxx> > > > Cc: Martin Wilck <mwilck@xxxxxxx> > > > Signed-off-by: Hannes Reinecke <hare@xxxxxxx> > > > --- > > > drivers/scsi/device_handler/scsi_dh_alua.c | 24 +++++++-------- > > > ----- > > > -- > > > 1 file changed, 7 insertions(+), 17 deletions(-) > > > > > > diff --git a/drivers/scsi/device_handler/scsi_dh_alua.c > > > b/drivers/scsi/device_handler/scsi_dh_alua.c > > > index 1d9be771f3ee..6921490a5e65 100644 > > > --- a/drivers/scsi/device_handler/scsi_dh_alua.c > > > +++ b/drivers/scsi/device_handler/scsi_dh_alua.c > > > @@ -49,6 +49,7 @@ > > > #define ALUA_PG_RUN_RTPG 0x10 > > > #define ALUA_PG_RUN_STPG 0x20 > > > #define ALUA_PG_RUNNING 0x40 > > > +#define ALUA_PG_FAILED 0x80 > > > > > > static uint optimize_stpg; > > > module_param(optimize_stpg, uint, S_IRUGO|S_IWUSR); > > > @@ -420,7 +421,7 @@ static enum scsi_disposition > > > alua_check_sense(struct scsi_device *sdev, > > > */ > > > rcu_read_lock(); > > > pg = rcu_dereference(h->pg); > > > - if (pg) > > > + if (pg && !(pg->flags & ALUA_PG_FAILED)) > > > pg->state = > > > SCSI_ACCESS_STATE_TRANSITIONING; > > > rcu_read_unlock(); > > > alua_check(sdev, false); > > > > You still return NEEDS_RETRY from alua_check_sense(), even if > > ALUA_PG_FAILED is set? > > > > > @@ -694,7 +695,7 @@ static int alua_rtpg(struct scsi_device > > > *sdev, > > > struct alua_port_group *pg) > > > > > > skip_rtpg: > > > spin_lock_irqsave(&pg->lock, flags); > > > - if (transitioning_sense) > > > + if (transitioning_sense && !(pg->flags & ALUA_PG_FAILED)) > > > pg->state = SCSI_ACCESS_STATE_TRANSITIONING; > > > > > > if (group_id_old != pg->group_id || state_old != pg- > > > >state || > > > @@ -718,23 +719,10 @@ static int alua_rtpg(struct scsi_device > > > *sdev, > > > struct alua_port_group *pg) > > > pg->interval = ALUA_RTPG_RETRY_DELAY; > > > err = SCSI_DH_RETRY; > > > } else { > > > - struct alua_dh_data *h; > > > - > > > - /* Transitioning time exceeded, set port > > > to > > > standby */ > > > + /* Transitioning time exceeded, mark pg > > > as > > > failed */ > > > err = SCSI_DH_IO; > > > - pg->state = SCSI_ACCESS_STATE_STANDBY; > > > + pg->flags |= ALUA_PG_FAILED; > > > pg->expiry = 0; > > > - rcu_read_lock(); > > > - list_for_each_entry_rcu(h, &pg->dh_list, > > > node) { > > > - if (!h->sdev) > > > - continue; > > > - h->sdev->access_state = > > > - (pg->state & > > > SCSI_ACCESS_STATE_MASK); > > > - if (pg->pref) > > > - h->sdev->access_state |= > > > - > > > SCSI_ACCESS_STATE > > > _PREFE > > > RRED; > > > - } > > > - rcu_read_unlock(); > > > } > > > break; > > > case SCSI_ACCESS_STATE_OFFLINE: > > > @@ -746,6 +734,8 @@ static int alua_rtpg(struct scsi_device > > > *sdev, > > > struct alua_port_group *pg) > > > /* Useable path if active */ > > > err = SCSI_DH_OK; > > > pg->expiry = 0; > > > + /* RTPG succeeded, clear ALUA_PG_FAILED */ > > > + pg->flags &= ~ALUA_PG_FAILED; > > > > Shouldn't this be done for any state except TRANSITIONING? > > > Why, but it does. > We're only entering this block if the state is not TRANSITIONING. > (It's part of a 'switch' statement) Right, the only exception is SCSI_ACCESS_STATE_OFFLINE, for which the ALUA_PG_FAILED state should of cause persist. Sorry for bothering. Martin