[linux-next:master 12502/12880] drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse: sparse: incorrect type in argument 1 (different address spaces)

kernel test robot <lkp@xxxxxxxxx> · Sat, 13 Jul 2024 10:53:23 +0800

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
head:   3fe121b622825ff8cc995a1e6b026181c48188db
commit: f6ca930d974e473fd608fc9aa1759fbe731fe44d [12502/12880] drm/xe: Add process name and PID to job timedout message
config: x86_64-randconfig-123-20240713 (https://download.01.org/0day-ci/archive/20240713/202407131049.PQilkG1A-lkp@xxxxxxxxx/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240713/202407131049.PQilkG1A-lkp@xxxxxxxxx/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@xxxxxxxxx>
| Closes: https://lore.kernel.org/oe-kbuild-all/202407131049.PQilkG1A-lkp@xxxxxxxxx/

sparse warnings: (new ones prefixed by >>)
>> drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse: sparse: incorrect type in argument 1 (different address spaces) @@     expected struct pid *pid @@     got struct pid [noderef] __rcu *pid @@
   drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse:     expected struct pid *pid
   drivers/gpu/drm/xe/xe_guc_submit.c:1164:52: sparse:     got struct pid [noderef] __rcu *pid

vim +1164 drivers/gpu/drm/xe/xe_guc_submit.c

  1054	
  1055	static enum drm_gpu_sched_stat
  1056	guc_exec_queue_timedout_job(struct drm_sched_job *drm_job)
  1057	{
		/*
		 * Timeout handler for a scheduler job on a GuC exec queue. Stops
		 * submission, decides whether the job should be treated as timed
		 * out, and then either re-arms it (sched_enable/rearm labels) or
		 * logs the timeout, bans the queue and sets an error on every
		 * pending job. Every exit path returns DRM_GPU_SCHED_STAT_NOMINAL.
		 */
  1058		struct xe_sched_job *job = to_xe_sched_job(drm_job);
  1059		struct xe_sched_job *tmp_job;
  1060		struct xe_exec_queue *q = job->q;
  1061		struct xe_gpu_scheduler *sched = &q->guc->sched;
  1062		struct xe_guc *guc = exec_queue_to_guc(q);
  1063		const char *process_name = "no process";	/* fallback used in the log message */
  1064		struct task_struct *task = NULL;
  1065		int err = -ETIME;	/* error for the first pending job; -EIO if the queue was reset */
  1066		pid_t pid = -1;	/* fallback used in the log message */
  1067		int i = 0;	/* position while walking the pending list */
  1068		bool wedged, skip_timeout_check;
  1069	
  1070		/*
  1071		 * TDR has fired before free job worker. Common if exec queue
  1072		 * immediately closed after last fence signaled.
  1073		 */
  1074		if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &job->fence->flags)) {
  1075			guc_exec_queue_free_job(drm_job);
  1076	
  1077			return DRM_GPU_SCHED_STAT_NOMINAL;
  1078		}
  1079	
  1080		/* Kill the run_job entry point */
  1081		xe_sched_submission_stop(sched);
  1082	
  1083		/* Must check all state after stopping scheduler */
  1084		skip_timeout_check = exec_queue_reset(q) ||
  1085			exec_queue_killed_or_banned_or_wedged(q) ||
  1086			exec_queue_destroyed(q);
  1087	
  1088		/* Job hasn't started, can't be timed out */
  1089		if (!skip_timeout_check && !xe_sched_job_started(job))
  1090			goto rearm;
  1091	
  1092		/*
  1093		 * XXX: Sampling timeout doesn't work in wedged mode as we have to
  1094		 * modify scheduling state to read timestamp. We could read the
  1095		 * timestamp from a register to accumulate current running time but this
  1096		 * doesn't work for SRIOV. For now assuming timeouts in wedged mode are
  1097		 * genuine timeouts.
  1098		 */
  1099		wedged = guc_submit_hint_wedged(exec_queue_to_guc(q));
  1100	
  1101		/* Engine state now stable, disable scheduling to check timestamp */
  1102		if (!wedged && exec_queue_registered(q)) {
  1103			int ret;
  1104	
  1105			if (exec_queue_reset(q))
  1106				err = -EIO;
  1107	
  1108			if (!exec_queue_destroyed(q)) {
  1109				/*
  1110				 * Wait for any pending G2H to flush out before
  1111				 * modifying state
  1112				 */
  1113				ret = wait_event_timeout(guc->ct.wq,
  1114							 !exec_queue_pending_enable(q) ||
  1115							 guc_read_stopped(guc), HZ * 5);
  1116				if (!ret || guc_read_stopped(guc))
  1117					goto trigger_reset;
  1118	
  1119				/*
  1120				 * Flag communicates to G2H handler that schedule
  1121				 * disable originated from a timeout check. The G2H then
  1122				 * avoids triggering cleanup or deregistering the exec
  1123				 * queue.
  1124				 */
  1125				set_exec_queue_check_timeout(q);
  1126				disable_scheduling(q, skip_timeout_check);
  1127			}
  1128	
  1129			/*
  1130			 * Must wait for scheduling to be disabled before signalling
  1131			 * any fences, if GT broken the GT reset code should signal us.
  1132			 *
  1133			 * FIXME: Tests can generate a ton of 0x6000 (IOMMU CAT fault
  1134			 * error) messages which can cause the schedule disable to get
  1135			 * lost. If this occurs, trigger a GT reset to recover.
  1136			 */
  1137			smp_rmb();
  1138			ret = wait_event_timeout(guc->ct.wq,
  1139						 !exec_queue_pending_disable(q) ||
  1140						 guc_read_stopped(guc), HZ * 5);
  1141			if (!ret || guc_read_stopped(guc)) {
				/* Also entered via goto from the pending-enable wait above */
  1142	trigger_reset:
  1143				if (!ret)
  1144					xe_gt_warn(guc_to_gt(guc), "Schedule disable failed to respond");
  1145				set_exec_queue_extra_ref(q);
  1146				xe_exec_queue_get(q);	/* GT reset owns this */
  1147				set_exec_queue_banned(q);
  1148				xe_gt_reset_async(q->gt);
  1149				xe_sched_tdr_queue_imm(sched);
  1150				goto rearm;
  1151			}
  1152		}
  1153	
  1154		/*
  1155		 * Check if job is actually timed out, if so restart job execution and TDR
  1156		 */
  1157		if (!wedged && !skip_timeout_check && !check_timeout(q, job) &&
  1158		    !exec_queue_reset(q) && exec_queue_registered(q)) {
  1159			clear_exec_queue_check_timeout(q);
  1160			goto sched_enable;
  1161		}
  1162	
		/*
		 * Look up the owning process, when the queue has a VM with an
		 * associated file, so the timeout message can name it; otherwise
		 * the "no process" / -1 defaults are printed.
		 */
  1163		if (q->vm && q->vm->xef) {
			/*
			 * NOTE(review): this is the line sparse flags in this report:
			 * drm->pid is a "struct pid __rcu *" passed where a plain
			 * "struct pid *" is expected. Presumably it should be read
			 * through an RCU accessor under rcu_read_lock() -- confirm
			 * against the drm_file documentation.
			 */
> 1164			task = get_pid_task(q->vm->xef->drm->pid, PIDTYPE_PID);
  1165			if (task) {
  1166				process_name = task->comm;
  1167				pid = task->pid;
  1168			}
  1169		}
  1170		xe_gt_notice(guc_to_gt(guc), "Timedout job: seqno=%u, lrc_seqno=%u, guc_id=%d, flags=0x%lx in %s [%d]",
  1171			     xe_sched_job_seqno(job), xe_sched_job_lrc_seqno(job),
  1172			     q->guc->id, q->flags, process_name, pid);
		/*
		 * process_name may point into task->comm, so the task is only
		 * released (presumably dropping the reference obtained via
		 * get_pid_task()) after the message has been emitted.
		 */
  1173		if (task)
  1174			put_task_struct(task);
  1175	
  1176		trace_xe_sched_job_timedout(job);
  1177	
  1178		if (!exec_queue_killed(q))
  1179			xe_devcoredump(job);
  1180	
  1181		/*
  1182		 * Kernel jobs should never fail, nor should VM jobs; if they do,
  1183		 * something has gone wrong and the GT needs a reset
  1184		 */
  1185		xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_KERNEL,
  1186			   "Kernel-submitted job timed out\n");
  1187		xe_gt_WARN(q->gt, q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q),
  1188			   "VM job timed out on non-killed execqueue\n");
  1189		if (!wedged && (q->flags & EXEC_QUEUE_FLAG_KERNEL ||
  1190				(q->flags & EXEC_QUEUE_FLAG_VM && !exec_queue_killed(q)))) {
  1191			if (!xe_sched_invalidate_job(job, 2)) {
  1192				clear_exec_queue_check_timeout(q);
  1193				xe_gt_reset_async(q->gt);
  1194				goto rearm;
  1195			}
  1196		}
  1197	
  1198		/* Finish cleaning up exec queue via deregister */
  1199		set_exec_queue_banned(q);
  1200		if (!wedged && exec_queue_registered(q) && !exec_queue_destroyed(q)) {
  1201			set_exec_queue_extra_ref(q);
  1202			xe_exec_queue_get(q);
  1203			__deregister_exec_queue(guc, q);
  1204		}
  1205	
  1206		/* Stop fence signaling */
  1207		xe_hw_fence_irq_stop(q->fence_irq);
  1208	
  1209		/*
  1210		 * Fence state now stable, stop / start scheduler which cleans up any
  1211		 * fences that are complete
  1212		 */
  1213		xe_sched_add_pending_job(sched, job);
  1214		xe_sched_submission_start(sched);
  1215	
  1216		xe_guc_exec_queue_trigger_cleanup(q);
  1217	
  1218		/* Mark all outstanding jobs as bad, thus completing them */
  1219		spin_lock(&sched->base.job_list_lock);
		/* The first entry on the pending list gets err, all later ones -ECANCELED */
  1220		list_for_each_entry(tmp_job, &sched->base.pending_list, drm.list)
  1221			xe_sched_job_set_error(tmp_job, !i++ ? err : -ECANCELED);
  1222		spin_unlock(&sched->base.job_list_lock);
  1223	
  1224		/* Start fence signaling */
  1225		xe_hw_fence_irq_start(q->fence_irq);
  1226	
  1227		return DRM_GPU_SCHED_STAT_NOMINAL;
  1228	
  1229	sched_enable:
  1230		enable_scheduling(q);
  1231	rearm:
  1232		/*
  1233		 * XXX: Ideally want to adjust timeout based on current execution time
  1234		 * but there is not currently an easy way to do this in the DRM
  1235		 * scheduler. With some thought, do this in a follow up.
  1236		 */
  1237		xe_sched_add_pending_job(sched, job);
  1238		xe_sched_submission_start(sched);
  1239	
  1240		return DRM_GPU_SCHED_STAT_NOMINAL;
  1241	}
  1242	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki