Hi André, the kernel test robot noticed the following build errors: [auto build test ERROR on linus/master] [also build test ERROR on v6.13-rc3 next-20241220] [If your patch is applied to the wrong git tree, kindly drop us a note. When submitting a patch, we suggest using '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Andr-Almeida/drm-amdgpu-Use-device-wedged-event/20241213-031134 base: linus/master patch link: https://lore.kernel.org/r/20241212190909.28559-2-andrealmeid%40igalia.com patch subject: [PATCH 1/1] drm/amdgpu: Use device wedged event config: arm-randconfig-001-20241220 (https://download.01.org/0day-ci/archive/20241220/202412202104.IwpOz5t5-lkp@xxxxxxxxx/config) compiler: clang version 19.1.3 (https://github.com/llvm/llvm-project ab51eccf88f5321e7c60591c5546b254b6afab99) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241220/202412202104.IwpOz5t5-lkp@xxxxxxxxx/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add the following tags | Reported-by: kernel test robot <lkp@xxxxxxxxx> | Closes: https://lore.kernel.org/oe-kbuild-all/202412202104.IwpOz5t5-lkp@xxxxxxxxx/ All errors (new ones prefixed by >>): In file included from drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:35: In file included from include/linux/iommu.h:10: In file included from include/linux/scatterlist.h:8: In file included from include/linux/mm.h:2223: include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion] 518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" | ~~~~~~~~~~~ ^ ~~~ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6061:2: error: call to undeclared function 'drm_dev_wedged_event'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration] 6061 | drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); | ^ >> drivers/gpu/drm/amd/amdgpu/amdgpu_device.c:6061:42: error: use of undeclared identifier 'DRM_WEDGE_RECOVERY_NONE' 6061 | drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); | ^ 1 warning and 2 errors generated. vim +/drm_dev_wedged_event +6061 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 5794 5795 /** 5796 * amdgpu_device_gpu_recover - reset the asic and recover scheduler 5797 * 5798 * @adev: amdgpu_device pointer 5799 * @job: which job trigger hang 5800 * @reset_context: amdgpu reset context pointer 5801 * 5802 * Attempt to reset the GPU if it has hung (all asics). 5803 * Attempt to do soft-reset or full-reset and reinitialize Asic 5804 * Returns 0 for success or an error on failure. 
5805 */ 5806 5807 int amdgpu_device_gpu_recover(struct amdgpu_device *adev, 5808 struct amdgpu_job *job, 5809 struct amdgpu_reset_context *reset_context) 5810 { 5811 struct list_head device_list, *device_list_handle = NULL; 5812 bool job_signaled = false; 5813 struct amdgpu_hive_info *hive = NULL; 5814 struct amdgpu_device *tmp_adev = NULL; 5815 int i, r = 0; 5816 bool need_emergency_restart = false; 5817 bool audio_suspended = false; 5818 int retry_limit = AMDGPU_MAX_RETRY_LIMIT; 5819 5820 /* 5821 * Special case: RAS triggered and full reset isn't supported 5822 */ 5823 need_emergency_restart = amdgpu_ras_need_emergency_restart(adev); 5824 5825 /* 5826 * Flush RAM to disk so that after reboot 5827 * the user can read log and see why the system rebooted. 5828 */ 5829 if (need_emergency_restart && amdgpu_ras_get_context(adev) && 5830 amdgpu_ras_get_context(adev)->reboot) { 5831 DRM_WARN("Emergency reboot."); 5832 5833 ksys_sync_helper(); 5834 emergency_restart(); 5835 } 5836 5837 dev_info(adev->dev, "GPU %s begin!\n", 5838 need_emergency_restart ? "jobs stop":"reset"); 5839 5840 if (!amdgpu_sriov_vf(adev)) 5841 hive = amdgpu_get_xgmi_hive(adev); 5842 if (hive) 5843 mutex_lock(&hive->hive_lock); 5844 5845 reset_context->job = job; 5846 reset_context->hive = hive; 5847 /* 5848 * Build list of devices to reset. 5849 * In case we are in XGMI hive mode, resort the device list 5850 * to put adev in the 1st position. 
5851 */ 5852 INIT_LIST_HEAD(&device_list); 5853 if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1) && hive) { 5854 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { 5855 list_add_tail(&tmp_adev->reset_list, &device_list); 5856 if (adev->shutdown) 5857 tmp_adev->shutdown = true; 5858 } 5859 if (!list_is_first(&adev->reset_list, &device_list)) 5860 list_rotate_to_front(&adev->reset_list, &device_list); 5861 device_list_handle = &device_list; 5862 } else { 5863 list_add_tail(&adev->reset_list, &device_list); 5864 device_list_handle = &device_list; 5865 } 5866 5867 if (!amdgpu_sriov_vf(adev)) { 5868 r = amdgpu_device_health_check(device_list_handle); 5869 if (r) 5870 goto end_reset; 5871 } 5872 5873 /* We need to lock reset domain only once both for XGMI and single device */ 5874 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 5875 reset_list); 5876 amdgpu_device_lock_reset_domain(tmp_adev->reset_domain); 5877 5878 /* block all schedulers and reset given job's ring */ 5879 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5880 5881 amdgpu_device_set_mp1_state(tmp_adev); 5882 5883 /* 5884 * Try to put the audio codec into suspend state 5885 * before gpu reset started. 5886 * 5887 * Due to the power domain of the graphics device 5888 * is shared with AZ power domain. Without this, 5889 * we may change the audio hardware from behind 5890 * the audio driver's back. That will trigger 5891 * some audio codec errors. 
5892 */ 5893 if (!amdgpu_device_suspend_display_audio(tmp_adev)) 5894 audio_suspended = true; 5895 5896 amdgpu_ras_set_error_query_ready(tmp_adev, false); 5897 5898 cancel_delayed_work_sync(&tmp_adev->delayed_init_work); 5899 5900 amdgpu_amdkfd_pre_reset(tmp_adev, reset_context); 5901 5902 /* 5903 * Mark these ASICs to be reseted as untracked first 5904 * And add them back after reset completed 5905 */ 5906 amdgpu_unregister_gpu_instance(tmp_adev); 5907 5908 drm_client_dev_suspend(adev_to_drm(tmp_adev), false); 5909 5910 /* disable ras on ALL IPs */ 5911 if (!need_emergency_restart && 5912 amdgpu_device_ip_need_full_reset(tmp_adev)) 5913 amdgpu_ras_suspend(tmp_adev); 5914 5915 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5916 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5917 5918 if (!amdgpu_ring_sched_ready(ring)) 5919 continue; 5920 5921 drm_sched_stop(&ring->sched, job ? &job->base : NULL); 5922 5923 if (need_emergency_restart) 5924 amdgpu_job_stop_all_jobs_on_sched(&ring->sched); 5925 } 5926 atomic_inc(&tmp_adev->gpu_reset_counter); 5927 } 5928 5929 if (need_emergency_restart) 5930 goto skip_sched_resume; 5931 5932 /* 5933 * Must check guilty signal here since after this point all old 5934 * HW fences are force signaled. 5935 * 5936 * job->base holds a reference to parent fence 5937 */ 5938 if (job && dma_fence_is_signaled(&job->hw_fence)) { 5939 job_signaled = true; 5940 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset"); 5941 goto skip_hw_reset; 5942 } 5943 5944 retry: /* Rest of adevs pre asic reset from XGMI hive. 
*/ 5945 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5946 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); 5947 /*TODO Should we stop ?*/ 5948 if (r) { 5949 dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ", 5950 r, adev_to_drm(tmp_adev)->unique); 5951 tmp_adev->asic_reset_res = r; 5952 } 5953 } 5954 5955 /* Actual ASIC resets if needed.*/ 5956 /* Host driver will handle XGMI hive reset for SRIOV */ 5957 if (amdgpu_sriov_vf(adev)) { 5958 if (amdgpu_ras_get_fed_status(adev) || amdgpu_virt_rcvd_ras_interrupt(adev)) { 5959 dev_dbg(adev->dev, "Detected RAS error, wait for FLR completion\n"); 5960 amdgpu_ras_set_fed(adev, true); 5961 set_bit(AMDGPU_HOST_FLR, &reset_context->flags); 5962 } 5963 5964 r = amdgpu_device_reset_sriov(adev, reset_context); 5965 if (AMDGPU_RETRY_SRIOV_RESET(r) && (retry_limit--) > 0) { 5966 amdgpu_virt_release_full_gpu(adev, true); 5967 goto retry; 5968 } 5969 if (r) 5970 adev->asic_reset_res = r; 5971 } else { 5972 r = amdgpu_do_asic_reset(device_list_handle, reset_context); 5973 if (r && r == -EAGAIN) 5974 goto retry; 5975 } 5976 5977 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5978 /* 5979 * Drop any pending non scheduler resets queued before reset is done. 5980 * Any reset scheduled after this point would be valid. Scheduler resets 5981 * were already dropped during drm_sched_stop and no new ones can come 5982 * in before drm_sched_start. 
5983 */ 5984 amdgpu_device_stop_pending_resets(tmp_adev); 5985 } 5986 5987 skip_hw_reset: 5988 5989 /* Post ASIC reset for all devs .*/ 5990 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 5991 5992 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { 5993 struct amdgpu_ring *ring = tmp_adev->rings[i]; 5994 5995 if (!amdgpu_ring_sched_ready(ring)) 5996 continue; 5997 5998 drm_sched_start(&ring->sched, 0); 5999 } 6000 6001 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) 6002 drm_helper_resume_force_mode(adev_to_drm(tmp_adev)); 6003 6004 if (tmp_adev->asic_reset_res) 6005 r = tmp_adev->asic_reset_res; 6006 6007 tmp_adev->asic_reset_res = 0; 6008 6009 if (r) { 6010 /* bad news, how to tell it to userspace ? 6011 * for ras error, we should report GPU bad status instead of 6012 * reset failure 6013 */ 6014 if (reset_context->src != AMDGPU_RESET_SRC_RAS || 6015 !amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) 6016 dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", 6017 atomic_read(&tmp_adev->gpu_reset_counter)); 6018 amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); 6019 } else { 6020 dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter)); 6021 if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0)) 6022 DRM_WARN("smart shift update failed\n"); 6023 } 6024 } 6025 6026 skip_sched_resume: 6027 list_for_each_entry(tmp_adev, device_list_handle, reset_list) { 6028 /* unlock kfd: SRIOV would do it separately */ 6029 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev)) 6030 amdgpu_amdkfd_post_reset(tmp_adev); 6031 6032 /* kfd_post_reset will do nothing if kfd device is not initialized, 6033 * need to bring up kfd here if it's not be initialized before 6034 */ 6035 if (!adev->kfd.init_complete) 6036 amdgpu_amdkfd_device_init(adev); 6037 6038 if (audio_suspended) 6039 amdgpu_device_resume_display_audio(tmp_adev); 6040 6041 amdgpu_device_unset_mp1_state(tmp_adev); 
6042 6043 amdgpu_ras_set_error_query_ready(tmp_adev, true); 6044 } 6045 6046 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, 6047 reset_list); 6048 amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); 6049 6050 end_reset: 6051 if (hive) { 6052 mutex_unlock(&hive->hive_lock); 6053 amdgpu_put_xgmi_hive(hive); 6054 } 6055 6056 if (r) 6057 dev_info(adev->dev, "GPU reset end with ret = %d\n", r); 6058 6059 atomic_set(&adev->reset_domain->reset_res, r); 6060 > 6061 drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE); 6062 6063 return r; 6064 } 6065 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki