On Mon, 2024-06-17 at 14:07 -0700, Bart Van Assche wrote: > > External email : Please do not click links or open attachments until > you have verified the sender or the content. > If ufshcd_abort() returns SUCCESS for an already completed command > then > that command is completed twice. This results in a crash. Prevent > this by > checking whether a command has completed without completion interrupt > from > the timeout handler. This CL fixes the following kernel crash: > > Unable to handle kernel NULL pointer dereference at virtual address > 0000000000000000 > Call trace: > dma_direct_map_sg+0x70/0x274 > scsi_dma_map+0x84/0x124 > ufshcd_queuecommand+0x3fc/0x880 > scsi_queue_rq+0x7d0/0x111c > blk_mq_dispatch_rq_list+0x440/0xebc > blk_mq_do_dispatch_sched+0x5a4/0x6b8 > __blk_mq_sched_dispatch_requests+0x150/0x220 > __blk_mq_run_hw_queue+0xf0/0x218 > __blk_mq_delay_run_hw_queue+0x8c/0x18c > blk_mq_run_hw_queue+0x1a4/0x360 > blk_mq_sched_insert_requests+0x130/0x334 > blk_mq_flush_plug_list+0x138/0x234 > blk_flush_plug_list+0x118/0x164 > blk_finish_plug() > read_pages+0x38c/0x408 > page_cache_ra_unbounded+0x230/0x2f8 > do_sync_mmap_readahead+0x1a4/0x208 > filemap_fault+0x27c/0x8f4 > f2fs_filemap_fault+0x28/0xfc > __do_fault+0xc4/0x208 > handle_pte_fault+0x290/0xe04 > do_handle_mm_fault+0x52c/0x858 > do_page_fault+0x5dc/0x798 > do_translation_fault+0x40/0x54 > do_mem_abort+0x60/0x134 > el0_da+0x40/0xb8 > el0t_64_sync_handler+0xc4/0xe4 > el0t_64_sync+0x1b4/0x1b8 > Hi Bart, This backtrace shows a kernel exception (KE) in ufshcd_queuecommand(). If ufshcd_abort() completes an already completed command, shouldn't the KE show a ufshcd_abort() backtrace instead? Moreover, if a command has already been completed by the IRQ handler, the rq may have been released, and ufshcd_mcq_req_to_hwq(hba, rq) will then trigger a KE. Here is our backtrace of this case. platform +platform:112b0000.ufshci ufshcd-mtk 112b0000.ufshci: ufshcd_try_to_abort_task: cmd at tag 41 not pending in the device. 
platform +platform:112b0000.ufshci ufshcd-mtk 112b0000.ufshci: ufshcd_try_to_abort_task: cmd at tag=41 is cleared. platform +platform:112b0000.ufshci ufshcd-mtk 112b0000.ufshci: Aborting tag 41 / CDB 0x28 succeeded Unable to handle kernel NULL pointer dereference at virtual address 0000000000000194 pc : [0xffffffddd7a79bf8] blk_mq_unique_tag+0x8/0x14 lr : [0xffffffddd6155b84] ufshcd_mcq_req_to_hwq+0x1c/0x40 [ufs_mediatek_mod_ise] do_mem_abort+0x58/0x118 el1_abort+0x3c/0x5c el1h_64_sync_handler+0x54/0x90 el1h_64_sync+0x68/0x6c blk_mq_unique_tag+0x8/0x14 ufshcd_err_handler+0xae4/0xfa8 [ufs_mediatek_mod_ise] process_one_work+0x208/0x4fc worker_thread+0x228/0x438 kthread+0x104/0x1d4 ret_from_fork+0x10/0x20 Thanks. Peter > Signed-off-by: Bart Van Assche <bvanassche@xxxxxxx> > --- > drivers/ufs/core/ufshcd.c | 23 ++++++++++++++++++++++- > 1 file changed, 22 insertions(+), 1 deletion(-) > > diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c > index e3835e61e4b1..47cc0802c4f4 100644 > --- a/drivers/ufs/core/ufshcd.c > +++ b/drivers/ufs/core/ufshcd.c > @@ -8922,7 +8922,28 @@ static void ufshcd_async_scan(void *data, > async_cookie_t cookie) > > static enum scsi_timeout_action ufshcd_eh_timed_out(struct scsi_cmnd > *scmd) > { > -struct ufs_hba *hba = shost_priv(scmd->device->host); > +struct scsi_device *sdev = scmd->device; > +struct ufs_hba *hba = shost_priv(sdev->host); > +struct scsi_cmnd *cmd2 = scmd; > +const u32 unique_tag = blk_mq_unique_tag(scsi_cmd_to_rq(scmd)); > + > +WARN_ON_ONCE(!scmd); > + > +if (is_mcq_enabled(hba)) { > +struct request *rq = scsi_cmd_to_rq(scmd); > +struct ufs_hw_queue *hwq = ufshcd_mcq_req_to_hwq(hba, rq); > + > +ufshcd_mcq_poll_cqe_lock(hba, hwq, &cmd2); > +} else { > +__ufshcd_poll(hba->host, UFSHCD_POLL_FROM_INTERRUPT_CONTEXT, > + &cmd2); > +} > +if (cmd2 == NULL) { > +sdev_printk(KERN_INFO, sdev, > + "%s: cmd with tag %#x has already been completed\n", > + __func__, unique_tag); > +return SCSI_EH_DONE; > +} > > if 
(!hba->system_suspending) { > /* Activate the error handler in the SCSI core. */