On Fri, Feb 25, 2022 at 03:45:45AM -0500, Wen Gong wrote: > Currently ath11k has device recovery logic, it is introduced by this > patch "ath11k: Add support for subsystem recovery" which is upstream > by https://git.kernel.org/pub/scm/linux/kernel/git/kvalo/ath.git/commit/?h=ath11k-bringup&id=3a7b4838b6f6f234239f263ef3dc02e612a083ad. > > The patch is for AHB devices such as IPQ8074, it has remote proc module > which is used to download the firmware and boots the processor which > firmware is running on. If firmware crashed, remote proc module will > detect it and download and boot firmware again. Below command will > trigger a firmware crash, and then user can test feature of device > recovery. > > Test command: > echo assert > /sys/kernel/debug/ath11k/qca6390\ hw2.0/simulate_fw_crash > echo assert > /sys/kernel/debug/ath11k/wcn6855\ hw2.0/simulate_fw_crash > > Unfortunately, QCA6390 is PCIe bus, it does not have the remote proc > module, it use mhi module to communicate between firmware and ath11k. > So ath11k does not support device recovery for QCA6390 currently. > > This patch is to add the extra logic which is different for QCA6390. > When firmware crashed, MHI_CB_EE_RDDM event will be indicate by > firmware and then ath11k_mhi_op_status_cb which is the callback of > mhi_controller will receive the MHI_CB_EE_RDDM event, then ath11k > will start to do recovery process, ath11k_core_reset() calls > ath11k_hif_power_down()/ath11k_hif_power_up(), then the mhi/ath11k > will start to download and boot firmware. There are some logic to > avoid deadloop recovery and two simultaneous recovery operations. > And because it has muti-radios for the soc, so it add some logic > in ath11k_mac_op_reconfig_complete() to make sure all radios has > reconfig complete and then complete the device recovery. > > Also it add workqueue_aux, because ab->workqueue is used when receive > ATH11K_QMI_EVENT_FW_READY in recovery process(queue_work(ab->workqueue, > &ab->restart_work)), and ath11k_core_reset will wait for max > ATH11K_RESET_TIMEOUT_HZ for the previous restart_work finished, if > ath11k_core_reset also queued in ab->workqueue, then it will delay > restart_work of previous recovery and lead previous recovery fail. > > ath11k recovery success for QCA6390/WCN6855 after apply this patch. > > Tested-on: QCA6390 hw2.0 PCI WLAN.HST.1.0.1-01740-QCAHSTSWPLZ_V2_TO_X86-1 > Tested-on: WCN6855 hw2.0 PCI WLAN.HSP.1.1-03003-QCAHSPSWPL_V1_V2_SILICONZ_LITE-2 > > Signed-off-by: Wen Gong <quic_wgong@xxxxxxxxxxx> > --- > drivers/net/wireless/ath/ath11k/core.c | 70 ++++++++++++++++++++++++++ > drivers/net/wireless/ath/ath11k/core.h | 13 +++++ > drivers/net/wireless/ath/ath11k/mac.c | 18 +++++++ > drivers/net/wireless/ath/ath11k/mhi.c | 33 ++++++++++++ > 4 files changed, 134 insertions(+) > > diff --git a/drivers/net/wireless/ath/ath11k/core.c b/drivers/net/wireless/ath/ath11k/core.c > index 7c508e9baa6d..00c83fdb0702 100644 > --- a/drivers/net/wireless/ath/ath11k/core.c > +++ b/drivers/net/wireless/ath/ath11k/core.c > @@ -1342,6 +1342,65 @@ static void ath11k_core_restart(struct work_struct *work) > complete(&ab->driver_recovery); > } > > +static void ath11k_core_reset(struct work_struct *work) > +{ > + struct ath11k_base *ab = container_of(work, struct ath11k_base, reset_work); > + int reset_count, fail_cont_count; > + long time_left; > + > + if (!(test_bit(ATH11K_FLAG_REGISTERED, &ab->dev_flags))) { > + ath11k_warn(ab, "ignore reset dev flags 0x%lx\n", ab->dev_flags); > + return; > + } > + > + /* Sometimes the recovery will fail and then the next all recovery fail, > + * this is to avoid infinite recovery since it can not recovery success. > + */ > + fail_cont_count = atomic_read(&ab->fail_cont_count); > + > + if (fail_cont_count >= ATH11K_RESET_MAX_FAIL_COUNT_FINAL) > + return; > + > + if (fail_cont_count >= ATH11K_RESET_MAX_FAIL_COUNT_FIRST && > + time_before(jiffies, ab->reset_fail_timeout)) > + return; > + > + reset_count = atomic_inc_return(&ab->reset_count); > + > + if (reset_count > 1) { > + /* Sometimes it happened another reset worker before the previous one > + * completed, then the second reset worker will destroy the previous one, > + * thus below is to avoid that. > + */ > + ath11k_warn(ab, "already reseting count %d\n", reset_count); > + > + reinit_completion(&ab->reset_complete); > + time_left = wait_for_completion_timeout(&ab->reset_complete, > + ATH11K_RESET_TIMEOUT_HZ); > + > + if (time_left) { > + ath11k_dbg(ab, ATH11K_DBG_BOOT, "to skip reset\n"); > + atomic_dec(&ab->reset_count); > + return; > + } > + > + ab->reset_fail_timeout = jiffies + ATH11K_RESET_FAIL_TIMEOUT_HZ; > + /* Record the continuous recovery fail count when recovery failed*/ > + fail_cont_count = atomic_inc_return(&ab->fail_cont_count); This results in a compiler warning because fail_cont_count is not used afterwards. I would suggest to replace the statement with atomic_inc(&ab->fail_cont_count); Guenter > + } > + > + ath11k_dbg(ab, ATH11K_DBG_BOOT, "reset starting\n"); > + > + ab->is_reset = true; > + atomic_set(&ab->recovery_count, 0); > + > + ath11k_hif_power_down(ab); > + ath11k_qmi_free_resource(ab); > + ath11k_hif_power_up(ab); > + > + ath11k_dbg(ab, ATH11K_DBG_BOOT, "reset started\n"); > +} > + > static int ath11k_init_hw_params(struct ath11k_base *ab) > { > const struct ath11k_hw_params *hw_params = NULL; > @@ -1411,6 +1470,9 @@ EXPORT_SYMBOL(ath11k_core_deinit); > > void ath11k_core_free(struct ath11k_base *ab) > { > + flush_workqueue(ab->workqueue_aux); > + destroy_workqueue(ab->workqueue_aux); > + > flush_workqueue(ab->workqueue); > destroy_workqueue(ab->workqueue); > > @@ -1434,9 +1496,14 @@ struct ath11k_base *ath11k_core_alloc(struct device *dev, size_t priv_size, > if (!ab->workqueue) > goto err_sc_free; > > + ab->workqueue_aux = create_singlethread_workqueue("ath11k_aux_wq"); > + if (!ab->workqueue_aux) > + goto err_free_wq; > + > mutex_init(&ab->core_lock); > spin_lock_init(&ab->base_lock); > mutex_init(&ab->vdev_id_11d_lock); > + init_completion(&ab->reset_complete); > > INIT_LIST_HEAD(&ab->peers); > init_waitqueue_head(&ab->peer_mapping_wq); > @@ -1445,6 +1512,7 @@ struct ath11k_base *ath11k_core_alloc(struct device *dev, size_t priv_size, > INIT_WORK(&ab->restart_work, ath11k_core_restart); > INIT_WORK(&ab->update_11d_work, ath11k_update_11d); > INIT_WORK(&ab->rfkill_work, ath11k_rfkill_work); > + INIT_WORK(&ab->reset_work, ath11k_core_reset); > timer_setup(&ab->rx_replenish_retry, ath11k_ce_rx_replenish_retry, 0); > init_completion(&ab->htc_suspend); > init_completion(&ab->wow.wakeup_completed); > @@ -1455,6 +1523,8 @@ struct ath11k_base *ath11k_core_alloc(struct device *dev, size_t priv_size, > > return ab; > > +err_free_wq: > + destroy_workqueue(ab->workqueue); > err_sc_free: > kfree(ab); > return NULL; > diff --git a/drivers/net/wireless/ath/ath11k/core.h b/drivers/net/wireless/ath/ath11k/core.h > index d2fc7a7a98f4..c85301e3609b 100644 > --- a/drivers/net/wireless/ath/ath11k/core.h > +++ b/drivers/net/wireless/ath/ath11k/core.h > @@ -39,6 +39,10 @@ > extern unsigned int ath11k_frame_mode; > > #define ATH11K_MON_TIMER_INTERVAL 10 > +#define ATH11K_RESET_TIMEOUT_HZ (20 * HZ) > +#define ATH11K_RESET_MAX_FAIL_COUNT_FIRST 3 > +#define ATH11K_RESET_MAX_FAIL_COUNT_FINAL 5 > +#define ATH11K_RESET_FAIL_TIMEOUT_HZ (20 * HZ) > > enum ath11k_supported_bw { > ATH11K_BW_20 = 0, > @@ -787,6 +791,15 @@ struct ath11k_base { > struct work_struct restart_work; > struct work_struct update_11d_work; > u8 new_alpha2[3]; > + struct workqueue_struct *workqueue_aux; > + struct work_struct reset_work; > + atomic_t reset_count; > + atomic_t recovery_count; > + bool is_reset; > + struct completion reset_complete; > + /* continuous recovery fail count */ > + atomic_t fail_cont_count; > + unsigned long reset_fail_timeout; > struct { > /* protected by data_lock */ > u32 fw_crash_counter; > diff --git a/drivers/net/wireless/ath/ath11k/mac.c b/drivers/net/wireless/ath/ath11k/mac.c > index d5b83f90d27a..5c62faf359a9 100644 > --- a/drivers/net/wireless/ath/ath11k/mac.c > +++ b/drivers/net/wireless/ath/ath11k/mac.c > @@ -7881,6 +7881,8 @@ ath11k_mac_op_reconfig_complete(struct ieee80211_hw *hw, > enum ieee80211_reconfig_type reconfig_type) > { > struct ath11k *ar = hw->priv; > + struct ath11k_base *ab = ar->ab; > + int recovery_count; > > if (reconfig_type != IEEE80211_RECONFIG_TYPE_RESTART) > return; > @@ -7892,6 +7894,22 @@ ath11k_mac_op_reconfig_complete(struct ieee80211_hw *hw, > ar->pdev->pdev_id); > ar->state = ATH11K_STATE_ON; > ieee80211_wake_queues(ar->hw); > + > + if (ab->is_reset) { > + recovery_count = atomic_inc_return(&ab->recovery_count); > + ath11k_dbg(ab, ATH11K_DBG_BOOT, > + "recovery count %d\n", recovery_count); > + /* When there are multiple radios in an SOC, > + * the recovery has to be done for each radio > + */ > + if (recovery_count == ab->num_radios) { > + atomic_dec(&ab->reset_count); > + complete(&ab->reset_complete); > + ab->is_reset = false; > + atomic_set(&ab->fail_cont_count, 0); > + ath11k_dbg(ab, ATH11K_DBG_BOOT, "reset success\n"); > + } > + } > } > > mutex_unlock(&ar->conf_mutex); > diff --git a/drivers/net/wireless/ath/ath11k/mhi.c b/drivers/net/wireless/ath/ath11k/mhi.c > index fc3524e83e52..61d83be4841f 100644 > --- a/drivers/net/wireless/ath/ath11k/mhi.c > +++ b/drivers/net/wireless/ath/ath11k/mhi.c > @@ -292,15 +292,48 @@ static void ath11k_mhi_op_runtime_put(struct mhi_controller *mhi_cntrl) > { > } > > +static char *ath11k_mhi_op_callback_to_str(enum mhi_callback reason) > +{ > + switch (reason) { > + case MHI_CB_IDLE: > + return "MHI_CB_IDLE"; > + case MHI_CB_PENDING_DATA: > + return "MHI_CB_PENDING_DATA"; > + case MHI_CB_LPM_ENTER: > + return "MHI_CB_LPM_ENTER"; > + case MHI_CB_LPM_EXIT: > + return "MHI_CB_LPM_EXIT"; > + case MHI_CB_EE_RDDM: > + return "MHI_CB_EE_RDDM"; > + case MHI_CB_EE_MISSION_MODE: > + return "MHI_CB_EE_MISSION_MODE"; > + case MHI_CB_SYS_ERROR: > + return "MHI_CB_SYS_ERROR"; > + case MHI_CB_FATAL_ERROR: > + return "MHI_CB_FATAL_ERROR"; > + case MHI_CB_BW_REQ: > + return "MHI_CB_BW_REQ"; > + default: > + return "UNKNOWN"; > + } > +}; > + > static void ath11k_mhi_op_status_cb(struct mhi_controller *mhi_cntrl, > enum mhi_callback cb) > { > struct ath11k_base *ab = dev_get_drvdata(mhi_cntrl->cntrl_dev); > > + ath11k_dbg(ab, ATH11K_DBG_BOOT, "mhi notify status reason %s\n", > + ath11k_mhi_op_callback_to_str(cb)); > + > switch (cb) { > case MHI_CB_SYS_ERROR: > ath11k_warn(ab, "firmware crashed: MHI_CB_SYS_ERROR\n"); > break; > + case MHI_CB_EE_RDDM: > + if (!(test_bit(ATH11K_FLAG_UNREGISTERING, &ab->dev_flags))) > + queue_work(ab->workqueue_aux, &ab->reset_work); > + break; > default: > break; > }