This is a note to let you know that I've just added the patch titled accel/habanalabs: fix EQ heartbeat mechanism to the 6.7-stable tree which can be found at: http://www.kernel.org/git/?p=linux/kernel/git/stable/stable-queue.git;a=summary The filename of the patch is: accel-habanalabs-fix-eq-heartbeat-mechanism.patch and it can be found in the queue-6.7 subdirectory. If you, or anyone else, feels it should not be added to the stable tree, please let <stable@xxxxxxxxxxxxxxx> know about it. commit a171f44f33338be6fa6ef814a8f2735b0aa3bf04 Author: Farah Kassabri <fkassabri@xxxxxxxxx> Date: Tue Oct 31 12:20:36 2023 +0200 accel/habanalabs: fix EQ heartbeat mechanism [ Upstream commit d1958dce5ab6a3e089c60cf474e8c9b7e96e70ad ] Stop rescheduling another heartbeat check when EQ heartbeat check fails as it generates confusing logs in dmesg that the heartbeat fails. Signed-off-by: Farah Kassabri <fkassabri@xxxxxxxxx> Reviewed-by: Oded Gabbay <ogabbay@xxxxxxxxxx> Signed-off-by: Oded Gabbay <ogabbay@xxxxxxxxxx> Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx> diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c index 9e461c03e705..9290d4374551 100644 --- a/drivers/accel/habanalabs/common/device.c +++ b/drivers/accel/habanalabs/common/device.c @@ -1044,18 +1044,19 @@ static bool is_pci_link_healthy(struct hl_device *hdev) return (vendor_id == PCI_VENDOR_ID_HABANALABS); } -static void hl_device_eq_heartbeat(struct hl_device *hdev) +static int hl_device_eq_heartbeat_check(struct hl_device *hdev) { - u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE; struct asic_fixed_properties *prop = &hdev->asic_prop; if (!prop->cpucp_info.eq_health_check_supported) - return; + return 0; if (hdev->eq_heartbeat_received) hdev->eq_heartbeat_received = false; else - hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask); + return -EIO; + + return 0; } static void hl_device_heartbeat(struct work_struct *work) @@ -1072,10 +1073,9 @@ static void hl_device_heartbeat(struct work_struct *work) /* * For EQ health check need to check if driver received the heartbeat eq event * in order to validate the eq is working. + * Only if both the EQ is healthy and we managed to send the next heartbeat reschedule. */ - hl_device_eq_heartbeat(hdev); - - if (!hdev->asic_funcs->send_heartbeat(hdev)) + if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev))) goto reschedule; if (hl_device_operational(hdev, NULL))