On Fri, Dec 6, 2019 at 5:08 PM Bjorn Helgaas <helgaas@xxxxxxxxxx> wrote: > > On Fri, Dec 06, 2019 at 08:09:48AM +0200, Ranran wrote: > > On Fri, Nov 29, 2019 at 8:38 PM Bjorn Helgaas <helgaas@xxxxxxxxxx> wrote: > > > > > > On Fri, Nov 29, 2019 at 06:10:51PM +0200, Ranran wrote: > > > > On Fri, Nov 29, 2019 at 4:58 PM Bjorn Helgaas <helgaas@xxxxxxxxxx> wrote: > > > > > On Fri, Nov 29, 2019 at 06:59:48AM +0000, bugzilla-daemon@xxxxxxxxxxxxxxxxxxx wrote: > > > > > > https://bugzilla.kernel.org/show_bug.cgi?id=205701 > > > I have tried to upgrade to latest kernel 5.4 (elrepo in centos), but > > with this processor/board (system x3650, Xeon), it get hang during > > kernel boot, without any error in dmesg, just keeps waiting for > > nothing for couple of minutes and than drops to dracut. > > - I don't think you ever said exactly what the original failure mode > was. You said DMA from an FPGA failed. What is the specific > device? How do you know the DMA fails? > Hi, the FPGA is an Intel Arria 10 device. We know that DMA fails because, when probing the DMA transaction from the FPGA to the CPU's RAM with SignalTap, we see that it stalls, i.e. it keeps waiting for the access to finish. We don't observe any error in dmesg. > - Re your v5.4 kernel testing, dracut is a user-space distro thing, so > it sounds like your hang is some sort of installation problem that I > can't really help you with. Maybe there are troubleshooting hints > at https://www.kernel.org/pub/linux/utils/boot/dracut/dracut.html. I know, that's quite frustrating. I tried to disable features using the kernel arguments noacpi and noapic, but it still freezes somewhere without giving any error. > You may also be able to just drop a v5.4 kernel on your v4.18 > system, at least for testing purposes. > What does it mean to drop a 5.4 kernel on a 4.18 kernel? > - Your comment #3 in bugzilla is a link to a Google Doc containing a > test module. In the future, please attach things as plain text > attachments directly to the bugzilla. 
There's an "Add attachment" > link immediately before the "Description" comment in bugzilla. I > did it for you this time. > > - It looks like your test_module.c is a kernel module, and frankly > it's a mess. Global variables that should be per-device, unused > variables (dma_get_mask() called for no reason), confused usage > (e.g., using both pci_dev_s and pPciDev), whitespace that appears > random, etc. I suggest starting with Documentation/PCI/pci.rst and, > at least for this debugging effort, making it a self-contained > driver instead of splitting things between a kernel module and > user-space. > I've attached the latest kernel module, which I hope will make things clearer. I will try to make it a standalone test the next time I'm in the lab. > - Your comment #4 is a link to a Google Doc containing lspci output. > I attached it to bugzilla directly for you. > > - You apparently didn't run lspci as root ("sudo lspci -vv"), so it > is missing a lot of information. > > - Your lspci doesn't match either of the dmesg logs. Please make sure > all your logs are from the same machine in the same configuration. > For example, the first devices found by the kernel (from both > comments #1 and #2) are: > > pci 0000:00:00.0: [8086:3c00] type 00 class 0x060000 > pci 0000:00:01.0: [8086:3c02] type 01 class 0x060400 > pci 0000:00:02.0: [8086:3c04] type 01 class 0x060400 > pci 0000:00:02.2: [8086:3c06] type 01 class 0x060400 > ... > > But the lspci doesn't include 00:01.0, 00:02.0, or 00:02.2. It > shows: > > 00:00.0 Host bridge: Intel Corporation Device 2020 (rev 04) > 00:04.0 System peripheral: Intel Corporation Sky Lake-E CBDMA Registers (rev 04) > 00:04.1 System peripheral: Intel Corporation Sky Lake-E CBDMA Registers (rev 04) > 00:04.2 System peripheral: Intel Corporation Sky Lake-E CBDMA Registers (rev 04) > ... I will do it in the lab tomorrow. Thanks.
/*
 * fio - minimal PCI test driver.
 *
 * Maps BAR1 of one PCI device (VENDOR_ID/DEVICE_ID), allocates one coherent
 * DMA buffer of COMMON_BUFFER_SIZE bytes and exposes both to user space
 * through a character device:
 *   - ioctl: 64-bit register read/write, wait-for-interrupt, query the DMA
 *            address of the common buffer;
 *   - mmap:  map the common buffer into the caller's address space.
 *
 * All state is global, so only a single device instance is supported.
 */
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/cdev.h>
#include <linux/platform_device.h>
#include <linux/uaccess.h>
#include <linux/dma-mapping.h>
#include "../../common/FioIoctl.h"
#include "../../common/FioIn.h"

int DeviceInit(struct pci_dev *pPciDev);

static DECLARE_WAIT_QUEUE_HEAD(sWaitQueuePacket);
static u32 sInterruptFlagPacket = 0;

/* CPU (kernel virtual) address of the coherent DMA buffer. */
static void *sVirtualKernelCommonBuffer;
/* Bus/DMA address of the same buffer; this is what the device must use. */
static u64 sPhysicalKernelCommonBuffer;
static dma_addr_t sDmaHandle;

/* BAR1 location; resource_size_t so a BAR above 4 GiB is not truncated. */
static resource_size_t sPhysicalBar1Address;
static resource_size_t sBar1Length;
static void __iomem *sVirtualBar1Address;

/* Major number returned by register_chrdev(), needed to unregister. */
static int sMajor;

struct pci_dev *pci_dev_s;

MODULE_AUTHOR("Z.V");
MODULE_DESCRIPTION("rcm");
MODULE_LICENSE("GPL");

/**********************************************************************************/
/* Read a 64-bit register at byte offset @Offset inside BAR1. */
u64 ReadWord64(u32 Offset)
{
	return ioread64(sVirtualBar1Address + Offset);
}

/**********************************************************************************/
/* Write @Data to the 64-bit register at byte offset @Offset inside BAR1. */
void WriteWord64(u32 Offset, u64 Data)
{
	iowrite64(Data, sVirtualBar1Address + Offset);
}

/**********************************************************************************/
/* True when a 64-bit access at @Offset lies entirely inside the mapped BAR1. */
static bool FioOffsetIsValid(u32 Offset)
{
	return sBar1Length >= sizeof(u64) &&
	       Offset <= sBar1Length - sizeof(u64);
}

/**********************************************************************************/
/*
 * ioctl entry point.
 *
 * Returns 0 on success, -EFAULT when the user buffer cannot be copied,
 * -EINVAL when a register offset falls outside BAR1, -ERESTARTSYS when the
 * interrupt wait is broken by a signal, and -ENOTTY for unknown codes.
 */
static long FioIoctl(struct file *file, unsigned int IoctlCode,
		     unsigned long IoctlParam)
{
	void __user *UserArg = (void __user *)IoctlParam;
	WRITE_WORD64_REQUEST WriteWord64Request;
	READ_WORD64_REQUEST ReadWord64Request;
	GET_PHYSICAL_BUFFER_REQUEST GetPhysicalBufferRequest;
	int rc;

	switch (IoctlCode) {
	case WRITE_WORD64_REQUEST_CODE:
		rc = copy_from_user(&WriteWord64Request, UserArg,
				    sizeof(WriteWord64Request));
		if (rc) {
			/* Never touch the hardware with a half-copied request. */
			printk("FioIoctl: copy_from_user failed. rc=0x%x\n", rc);
			return -EFAULT;
		}
		if (!FioOffsetIsValid(WriteWord64Request.Offset))
			return -EINVAL;

		WriteWord64(WriteWord64Request.Offset, WriteWord64Request.Data);
		printk("WR: 0x%016llx to 0x%08x\n",
		       (unsigned long long)WriteWord64Request.Data,
		       (unsigned int)WriteWord64Request.Offset);
		break;

	case READ_WORD64_REQUEST_CODE:
		rc = copy_from_user(&ReadWord64Request, UserArg,
				    sizeof(ReadWord64Request));
		if (rc) {
			printk("rcm_ioctl: copy_from_user failed. rc=0x%x\n", rc);
			return -EFAULT;
		}
		if (!FioOffsetIsValid(ReadWord64Request.Offset))
			return -EINVAL;

		ReadWord64Request.Data = ReadWord64(ReadWord64Request.Offset);
		printk("RD: 0x%016llx from 0x%08x\n",
		       (unsigned long long)ReadWord64Request.Data,
		       (unsigned int)ReadWord64Request.Offset);

		rc = copy_to_user(UserArg, &ReadWord64Request,
				  sizeof(ReadWord64Request));
		if (rc) {
			printk("FioIoctl: copy_to_user failed. rc=0x%x\n", rc);
			return -EFAULT;
		}
		break;

	case INTERRUPT_REQUEST_CODE:
		/*
		 * A non-zero return means a signal arrived, not an interrupt;
		 * leave the flag alone so a pending interrupt is not lost.
		 */
		if (wait_event_interruptible(sWaitQueuePacket,
					     sInterruptFlagPacket != 0))
			return -ERESTARTSYS;
		sInterruptFlagPacket = 0;
		break;

	case GET_PHYSICAL_BUFFER_REQUEST_CODE:
		rc = copy_from_user(&GetPhysicalBufferRequest, UserArg,
				    sizeof(GetPhysicalBufferRequest));
		if (rc) {
			printk("rcm_ioctl: copy_from_user failed. rc=0x%x\n", rc);
			return -EFAULT;
		}

		GetPhysicalBufferRequest.Address = sPhysicalKernelCommonBuffer;

		rc = copy_to_user(UserArg, &GetPhysicalBufferRequest,
				  sizeof(GetPhysicalBufferRequest));
		if (rc) {
			printk("FioIoctl: copy_to_user failed. rc=0x%x\n", rc);
			return -EFAULT;
		}
		break;

	default:
		printk("FioIoctl: invalid ioctl code(0x%x)\n", IoctlCode);
		return -ENOTTY;
	}

	return 0;
}

/**********************************************************************************/
/*
 * Map the common DMA buffer into user space.
 *
 * dma_mmap_coherent() is used instead of remap_pfn_range() on the DMA handle:
 * a DMA address is not guaranteed to be a CPU physical address (e.g. behind an
 * IOMMU), and the helper also rejects requests larger than the buffer.
 */
static int FioMmap(struct file *flip, struct vm_area_struct *vma)
{
	int rc;

	printk("-->FioMmap\n");

	if (pci_dev_s == NULL || sVirtualKernelCommonBuffer == NULL)
		return -ENODEV;

	rc = dma_mmap_coherent(&pci_dev_s->dev, vma, sVirtualKernelCommonBuffer,
			       sDmaHandle, COMMON_BUFFER_SIZE);
	if (rc) {
		printk("FioMmap: dma_mmap_coherent failed. rc=%d\n", rc);
		return rc;
	}

	printk("<--FioMmap\n");
	return 0;
}

/**********************************************************************************/
/*
 * Interrupt handler: reads the card status register, writes the value back
 * and wakes any INTERRUPT_REQUEST_CODE waiter.  Returns IRQ_NONE when the
 * status reads as zero so a shared line is passed on to other handlers.
 */
static irqreturn_t IrqHandlerPacket(int irq, void *data)
{
	u32 Status;

	printk("-->IrqHandler\n");

	/*
	 * NOTE(review): only the low 32 bits of the 64-bit register are
	 * examined and written back - confirm against the FPGA register map.
	 */
	Status = (u32)ReadWord64(CARD_STATUS_OFFSET);
	if (Status == 0)
		return IRQ_NONE;

	/* Presumably write-to-acknowledge - TODO confirm. */
	WriteWord64(CARD_STATUS_OFFSET, Status);

	sInterruptFlagPacket = 1;
	wake_up_interruptible(&sWaitQueuePacket);

	printk("<--IrqHandler\n");
	return IRQ_HANDLED;
}

/**********************************************************************************/
/* open(): nothing to set up, only traces the call. */
static int FioOpen(struct inode *inode, struct file *fl)
{
	printk("-->FioOpen\n");
	printk("<--FioOpen\n");
	return 0;
}

/**********************************************************************************/
/* release(): nothing to tear down, only traces the call. */
static int FioRelease(struct inode *inode, struct file *fl)
{
	printk("-->FioRelease\n");
	printk("<--FioRelease\n");
	return 0;
}

/**********************************************************************************/
static struct file_operations sDrvOperations = {
	.owner          = THIS_MODULE,
	.unlocked_ioctl = FioIoctl,
	.open           = FioOpen,
	.release        = FioRelease,
	.mmap           = FioMmap,
};

/**********************************************************************************/
/*
 * Allocate the coherent DMA buffer shared with the device and stamp a test
 * pattern into its first word.
 *
 * Returns 0 on success, -ENOMEM when the allocation fails.
 */
int AllocateCommonBuffer(struct pci_dev *pPciDev)
{
	u64 mask;

	/*
	 * NOTE(review): the DMA mask is left at its default.  If the device can
	 * address more than 32 bits, call dma_set_mask_and_coherent() here.
	 */
	mask = dma_get_mask(&pPciDev->dev);
	printk("maks=0x%llx\n", mask);

	/* Zone flags such as GFP_DMA must not be passed to the DMA API. */
	sVirtualKernelCommonBuffer = dma_alloc_coherent(&pPciDev->dev,
							COMMON_BUFFER_SIZE,
							&sDmaHandle, GFP_KERNEL);
	if (sVirtualKernelCommonBuffer == NULL) {
		printk("FioInit: dma_alloc_coherent failed\n");
		return -ENOMEM;
	}

	sPhysicalKernelCommonBuffer = (u64)sDmaHandle;
	printk("sPhysicalKernelCommonBuffer=0x%llx\n", sPhysicalKernelCommonBuffer);

	*(u32 *)sVirtualKernelCommonBuffer = 0xCAFE2DAD;

	return 0;
}

/**********************************************************************************/
/*
 * Bring up the device: enable it, claim and map BAR1, hook the interrupt,
 * allocate the common buffer and finally enable bus mastering.
 *
 * Returns 0 on success or a negative errno; on failure everything acquired
 * so far is released again.
 */
int DeviceInit(struct pci_dev *pPciDev)
{
	int err;
	u32 Pattern;

	err = pci_enable_device(pPciDev);
	if (err) {
		printk("FioInit: pci_enable_device failed\n");
		return err;
	}

	sPhysicalBar1Address = pci_resource_start(pPciDev, 1);
	printk("FioInit: Bar1Address=0x%llx\n",
	       (unsigned long long)sPhysicalBar1Address);

	sBar1Length = pci_resource_len(pPciDev, 1);
	printk("FioInit: bar1_len_s=0x%llx\n", (unsigned long long)sBar1Length);

	if (sPhysicalBar1Address == 0 || sBar1Length == 0) {
		printk("FioInit: BAR1 is not assigned\n");
		err = -ENODEV;
		goto err_disable;
	}

	if (request_mem_region(sPhysicalBar1Address, sBar1Length, "fio") == NULL) {
		printk("FioInit: request_mem_region failed\n");
		err = -EBUSY;
		goto err_disable;
	}

	/*
	 * NOTE(review): newer kernels dropped ioremap_nocache() in favour of
	 * plain ioremap() - confirm against the target kernel version.
	 */
	sVirtualBar1Address = ioremap_nocache(sPhysicalBar1Address, sBar1Length);
	if (sVirtualBar1Address == NULL) {
		printk("FioInit: ioremap_nocache failed\n");
		err = -ENOMEM;
		goto err_release;
	}
	printk("virtual_bar1_base_s=0x%p\n", sVirtualBar1Address);

	/* The handler touches BAR1, so it may only be hooked after the mapping. */
	err = request_irq(pPciDev->irq, IrqHandlerPacket, IRQF_SHARED, "fio",
			  pPciDev);
	if (err != 0) {
		printk("request_irq 0 failed.\n");
		goto err_unmap;
	}

	Pattern = (u32)ReadWord64(FPGA_VERSION_OFFSET);
	printk("Pattern=0x%08x\n", Pattern);

	err = AllocateCommonBuffer(pPciDev);
	if (err != 0)
		goto err_irq;

	/*
	 * Without bus mastering the device is not permitted to initiate DMA;
	 * enable it last, once the buffer it will target exists.
	 */
	pci_set_master(pPciDev);

	return 0;

err_irq:
	free_irq(pPciDev->irq, pPciDev);
err_unmap:
	iounmap(sVirtualBar1Address);
	sVirtualBar1Address = NULL;
err_release:
	release_mem_region(sPhysicalBar1Address, sBar1Length);
err_disable:
	pci_disable_device(pPciDev);
	return err;
}

/**********************************************************************************/
/* Undo a successful DeviceInit(), stopping DMA before the buffer is freed. */
static void DeviceExit(struct pci_dev *pPciDev)
{
	pci_clear_master(pPciDev);
	free_irq(pPciDev->irq, pPciDev);

	dma_free_coherent(&pPciDev->dev, COMMON_BUFFER_SIZE,
			  sVirtualKernelCommonBuffer, sDmaHandle);
	sVirtualKernelCommonBuffer = NULL;

	iounmap(sVirtualBar1Address);
	sVirtualBar1Address = NULL;
	release_mem_region(sPhysicalBar1Address, sBar1Length);

	pci_disable_device(pPciDev);
}

/**********************************************************************************/
/*
 * Module entry point: locate the device, initialise it, and only then publish
 * the character device so user space can never reach half-initialised state.
 */
static int __init FioInit(void)
{
	int rc;

	printk("-->FioInit\n");

	/* pci_get_device() takes a reference that is dropped with pci_dev_put(). */
	pci_dev_s = pci_get_device(VENDOR_ID, DEVICE_ID, NULL);
	if (pci_dev_s == NULL) {
		printk("rcm_init: pci_get_device failed\n");
		return -ENODEV;
	}

	rc = DeviceInit(pci_dev_s);
	if (rc != 0)
		goto err_put;

	rc = register_chrdev(0, "fio", &sDrvOperations);
	if (rc < 0) {
		printk("FioInit: register_chrdev failed\n");
		goto err_device;
	}
	sMajor = rc;
	printk("Major=%d\n", sMajor);

	printk("<--FioInit\n");
	return 0;

err_device:
	DeviceExit(pci_dev_s);
err_put:
	pci_dev_put(pci_dev_s);
	pci_dev_s = NULL;
	return rc;
}

/**********************************************************************************/
/* Module exit point: remove the character device, then release the hardware. */
static void __exit FioExit(void)
{
	printk(KERN_ALERT "-->FioExit\n");

	unregister_chrdev(sMajor, "fio");
	DeviceExit(pci_dev_s);
	pci_dev_put(pci_dev_s);
	pci_dev_s = NULL;

	printk(KERN_ALERT "<--FioExit\n");
}

module_init(FioInit);
module_exit(FioExit);