Could someone please help review this patch? Thank you! 在 2024/5/30 17:45, mengfanhui 写道: > If a DCMD timeout is not handled, the next interaction between the driver and firmware will still > result in a DCMD timeout, which may cause the system to crash or hang. > > Add proper error handling for DCMD commands > on Fusion adapters: > > 1. The action to take on a DCMD timeout is decided by > dcmd_timeout_ocr_possible(), which selects > one of the following: > INITIATE_OCR > KILL_ADAPTER > IGNORE_TIMEOUT > > 2. If those DCMDs fail, the driver bails out. > > Error log: > [ 201.689759] megaraid_sas 0001:05:00.0: megasas_sync_pd_seq_num DCMD timed out, continue without JBOD sequence map > [ 242.649061] [] megasas_init+0x114/0x4000 [megaraid_sas] > [ 363.481009] [] megasas_issue_blocked_cmd+0x1d8/0x268 [megaraid_sas] > [ 363.481159] [] megasas_get_pd_list+0x548/0x688 [megaraid_sas] > [ 363.481309] [] megasas_init_fw+0xb38/0x1104 [megaraid_sas] > [ 363.481459] [] megasas_probe_one+0x1f4/0x5c4 [megaraid_sas] > [ 363.482419] [] megasas_init+0x114/0x4000 [megaraid_sas] > [ 381.912298] megaraid_sas 0001:05:00.0: DCMD(opcode: 0x2010100) is timed out, func:megasas_issue_blocked_cmd > [ 381.912979] megaraid_sas 0001:05:00.0: Ignore DCMD timeout: megasas_get_pd_list 4727 > [ 484.313526] [] megasas_init+0x114/0x4000 [megaraid_sas] > [ 562.136294] megaraid_sas 0001:05:00.0: DCMD(opcode: 0x3010100) is timed out, func:megasas_issue_blocked_cmd > [ 562.137074] megaraid_sas 0001:05:00.0: Ignore DCMD timeout: megasas_ld_list_query 4973 > [ 562.137081] megaraid_sas 0001:05:00.0: failed to get LD list > [ 562.137425] megaraid_sas 0001:05:00.0: megasas_init_fw: megasas_get_device_list failed > [ 562.137767] megaraid_sas 0001:05:00.0: megasas_disable_intr_fusion is called outbound_intr_mask:0x40000009 > [ 562.139232] megaraid_sas 0001:05:00.0: Failed from megasas_init_fw 6572 > > Co-developed-by: Jackie Liu <liuyun01@xxxxxxxxxx> > Signed-off-by: 
Jackie Liu <liuyun01@xxxxxxxxxx> > Signed-off-by: mengfanhui <mengfanhui@xxxxxxxxxx> > Suggested-by: Geliang Tang <geliang@xxxxxxxxxx> > --- > drivers/scsi/megaraid/megaraid_sas.h | 1 + > drivers/scsi/megaraid/megaraid_sas_base.c | 4 +- > drivers/scsi/megaraid/megaraid_sas_fusion.c | 71 +++++++++++++++++---- > 3 files changed, 62 insertions(+), 14 deletions(-) > > diff --git a/drivers/scsi/megaraid/megaraid_sas.h b/drivers/scsi/megaraid/megaraid_sas.h > index 5680c6cdb221..91570c5e8456 100644 > --- a/drivers/scsi/megaraid/megaraid_sas.h > +++ b/drivers/scsi/megaraid/megaraid_sas.h > @@ -2760,5 +2760,6 @@ void megasas_exit_debugfs(void); > void megasas_setup_debugfs(struct megasas_instance *instance); > void megasas_destroy_debugfs(struct megasas_instance *instance); > int megasas_blk_mq_poll(struct Scsi_Host *shost, unsigned int queue_num); > +int dcmd_timeout_ocr_possible(struct megasas_instance *instance); > > #endif /*LSI_MEGARAID_SAS_H */ > diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c > index 170b38f04655..ba8061ea2078 100644 > --- a/drivers/scsi/megaraid/megaraid_sas_base.c > +++ b/drivers/scsi/megaraid/megaraid_sas_base.c > @@ -4518,8 +4518,8 @@ int megasas_alloc_cmds(struct megasas_instance *instance) > * Return 0 for only Fusion adapter, if driver load/unload is not in progress > * or FW is not under OCR. 
> */ > -inline int > -dcmd_timeout_ocr_possible(struct megasas_instance *instance) { > +int dcmd_timeout_ocr_possible(struct megasas_instance *instance) > +{ > > if (instance->adapter_type == MFI_SERIES) > return KILL_ADAPTER; > diff --git a/drivers/scsi/megaraid/megaraid_sas_fusion.c b/drivers/scsi/megaraid/megaraid_sas_fusion.c > index 6c1fb8149553..f0aeb1ee83a2 100644 > --- a/drivers/scsi/megaraid/megaraid_sas_fusion.c > +++ b/drivers/scsi/megaraid/megaraid_sas_fusion.c > @@ -1363,17 +1363,42 @@ megasas_sync_pd_seq_num(struct megasas_instance *instance, bool pend) { > "driver supports max %d JBOD, but FW reports %d\n", > MAX_PHYSICAL_DEVICES, le32_to_cpu(pd_sync->count)); > ret = -EINVAL; > + goto out; > } > > - if (ret == DCMD_TIMEOUT) > - dev_warn(&instance->pdev->dev, > - "%s DCMD timed out, continue without JBOD sequence map\n", > - __func__); > - > - if (ret == DCMD_SUCCESS) > + switch (ret) { > + case DCMD_SUCCESS: > instance->pd_seq_map_id++; > + break; > + case DCMD_TIMEOUT: > + switch (dcmd_timeout_ocr_possible(instance)) { > + case INITIATE_OCR: > + cmd->flags |= DRV_DCMD_SKIP_REFIRE; > + mutex_unlock(&instance->reset_mutex); > + megasas_reset_fusion(instance->host, > + MFI_IO_TIMEOUT_OCR); > + mutex_lock(&instance->reset_mutex); > + break; > + case KILL_ADAPTER: > + megaraid_sas_kill_hba(instance); > + break; > + case IGNORE_TIMEOUT: > + dev_info(&instance->pdev->dev, "Ignore DCMD timeout: %s %d\n", > + __func__, __LINE__); > + break; > + } > + break; > + case DCMD_FAILED: > + dev_err(&instance->pdev->dev, > + "%s: MR_DCMD_SYSTEM_PD_MAP_GET_INFO failed\n", > + __func__); > + break; > + } > + > +out: > + if (ret != DCMD_TIMEOUT) > + megasas_return_cmd(instance, cmd); > > - megasas_return_cmd(instance, cmd); > return ret; > } > > @@ -1449,12 +1474,34 @@ megasas_get_ld_map_info(struct megasas_instance *instance) > else > ret = megasas_issue_polled(instance, cmd); > > - if (ret == DCMD_TIMEOUT) > - dev_warn(&instance->pdev->dev, > - "%s DCMD timed out, 
RAID map is disabled\n", > - __func__); > + switch (ret) { > + case DCMD_TIMEOUT: > + switch (dcmd_timeout_ocr_possible(instance)) { > + case INITIATE_OCR: > + cmd->flags |= DRV_DCMD_SKIP_REFIRE; > + mutex_unlock(&instance->reset_mutex); > + megasas_reset_fusion(instance->host, > + MFI_IO_TIMEOUT_OCR); > + mutex_lock(&instance->reset_mutex); > + break; > + case KILL_ADAPTER: > + megaraid_sas_kill_hba(instance); > + break; > + case IGNORE_TIMEOUT: > + dev_info(&instance->pdev->dev, "Ignore DCMD timeout: %s %d\n", > + __func__, __LINE__); > + break; > + } > + break; > + case DCMD_FAILED: > + dev_err(&instance->pdev->dev, > + "%s: MR_DCMD_LD_MAP_GET_INFO failed\n", > + __func__); > + break; > + } > > - megasas_return_cmd(instance, cmd); > + if (ret != DCMD_TIMEOUT) > + megasas_return_cmd(instance, cmd); > > return ret; > }