Re: [Bug 205701] New: Can't access RAM from PCIe

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Fri, Dec 6, 2019 at 5:08 PM Bjorn Helgaas <helgaas@xxxxxxxxxx> wrote:
>
> On Fri, Dec 06, 2019 at 08:09:48AM +0200, Ranran wrote:
> > On Fri, Nov 29, 2019 at 8:38 PM Bjorn Helgaas <helgaas@xxxxxxxxxx> wrote:
> > >
> > > On Fri, Nov 29, 2019 at 06:10:51PM +0200, Ranran wrote:
> > > > On Fri, Nov 29, 2019 at 4:58 PM Bjorn Helgaas <helgaas@xxxxxxxxxx> wrote:
> > > > > On Fri, Nov 29, 2019 at 06:59:48AM +0000, bugzilla-daemon@xxxxxxxxxxxxxxxxxxx wrote:
> > > > > > https://bugzilla.kernel.org/show_bug.cgi?id=205701
>
> > I have tried to upgrade to latest kernel 5.4 (elrepo in centos), but
> > with this processor/board (system x3650, Xeon), it get hang during
> > kernel boot, without any error in dmesg, just keeps waiting for
> > nothing for couple of minutes and than drops to dracut.
>
> - I don't think you ever said exactly what the original failure mode
>   was.  You said DMA from an FPGA failed.  What is the specific
>   device?  How do you know the DMA fails?
>

Hi,
FPGA is Intel's Arria 10 device.
We know that the DMA fails because, when probing the DMA transaction
from the FPGA to the CPU's RAM with SignalTap, we see that it stalls,
i.e. it keeps waiting for the access to finish.
We don't observe any error in dmesg.


> - Re your v5.4 kernel testing, dracut is a user-space distro thing, so
>   it sounds like your hang is some sort of installation problem that I
>   can't really help you with.  Maybe there are troubleshooting hints
>   at https://www.kernel.org/pub/linux/utils/boot/dracut/dracut.html.

I know, that's quite frustrating. I tried to disable features using
the kernel arguments noacpi and noapic, but it still freezes somewhere
without giving any error.

>   You may also be able to just drop a v5.4 kernel on your v4.18
>   system, at least for testing purposes.
>
What does it mean to drop a 5.4 kernel on a 4.18 kernel?


> - Your comment #3 in bugzilla is a link to a Google Doc containing a
>   test module.  In the future, please attach things as plain text
>   attachments directly to the bugzilla.  There's an "Add attachment"
>   link immediately before the "Description" comment in bugzilla.  I
>   did it for you this time.
>
> - It looks like your test_module.c is a kernel module, and frankly
>   it's a mess.  Global variables that should be per-device, unused
>   variables (dma_get_mask() called for no reason), confused usage
>   (e.g., using both pci_dev_s and pPciDev), whitespace that appears
>   random, etc.  I suggest starting with Documentation/PCI/pci.rst and,
>   at least for this debugging effort, making it a self-contained
>   driver instead of splitting things between a kernel module and
>   user-space.
>

I've attached the latest kernel module, which I hope will make things
clearer. I will try to make it a standalone test the next time I'm in
the lab.

> - Your comment #4 is a link to a Google Doc containing lspci output.
>   I attached it to bugzilla directly for you.
>
> - You apparently didn't run lspci as root ("sudo lspci -vv"), so it
>   is missing a lot of information.
>
> - Your lspci doesn't match either of the dmesg logs.  Please make sure
>   all your logs are from the same machine in the same configuration.
>   For example, the first devices found by the kernel (from both
>   comments #1 and #2) are:
>
>     pci 0000:00:00.0: [8086:3c00] type 00 class 0x060000
>     pci 0000:00:01.0: [8086:3c02] type 01 class 0x060400
>     pci 0000:00:02.0: [8086:3c04] type 01 class 0x060400
>     pci 0000:00:02.2: [8086:3c06] type 01 class 0x060400
>     ...
>
>   But the lspci doesn't include 00:01.0, 00:02.0, or 00:02.2.  It
>   shows:
>
>     00:00.0 Host bridge: Intel Corporation Device 2020 (rev 04)
>     00:04.0 System peripheral: Intel Corporation Sky Lake-E CBDMA Registers (rev 04)
>     00:04.1 System peripheral: Intel Corporation Sky Lake-E CBDMA Registers (rev 04)
>     00:04.2 System peripheral: Intel Corporation Sky Lake-E CBDMA Registers (rev 04)
>     ...

 I will do it in the lab tomorrow. Thanks.
#include <linux/init.h>
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/cdev.h>
#include <linux/pci.h>
#include <linux/platform_device.h>

#include "../../common/FioIoctl.h"
#include "../../common/FioIn.h"

/* Forward declaration; DeviceInit() is defined further down in this file. */
int DeviceInit(struct pci_dev *pPciDev);

/*
 * Wait queue the INTERRUPT_REQUEST_CODE ioctl sleeps on; the interrupt
 * handler wakes it.
 */
static DECLARE_WAIT_QUEUE_HEAD(sWaitQueuePacket);

/*
 * Set to 1 by the interrupt handler, reset to 0 by the ioctl once it has been
 * woken. Shared between interrupt and process context without locking.
 */
static u32 sInterruptFlagPacket = 0;

//static u32 sInterruptCounterPacket = 0;

/*
 * Driver state. Everything below is a single file-scope instance, so the
 * module can drive only one device at a time.
 */

/* CPU (kernel virtual) address of the buffer returned by dma_alloc_coherent(). */
static	void*		sVirtualKernelCommonBuffer;
/*
 * Copy of sDmaHandle widened to u64; this is the value reported to user space
 * by the GET_PHYSICAL_BUFFER_REQUEST_CODE ioctl.
 * NOTE(review): despite the name this is a DMA (bus) address, which is not
 * guaranteed to equal a CPU physical address, e.g. behind an IOMMU.
 */
static  u64			sPhysicalKernelCommonBuffer;
/*
 * Start of BAR1 as returned by pci_resource_start(), stored in 32 bits.
 * NOTE(review): a BAR placed above 4 GiB would not fit here.
 */
static	u32     	sPhysicalBar1Address;
/* Length of BAR1 as returned by pci_resource_len(), stored in 32 bits. */
static	u32			sBar1Length;
/* Kernel mapping of BAR1; base address for ReadWord64()/WriteWord64(). */
static	void*		sVirtualBar1Address;
/* DMA handle filled in by dma_alloc_coherent() for the common buffer. */
static  dma_addr_t 	sDmaHandle;
/*
 * Device found by pci_get_device() in FioInit().
 * Not declared static, so this symbol has external linkage.
 */
struct  pci_dev		*pci_dev_s;

/* Module metadata. */
MODULE_AUTHOR("Z.V");
MODULE_DESCRIPTION("rcm");
MODULE_LICENSE("GPL");

/**********************************************************************************/
/*
 * ReadWord64 - read one 64-bit word from the BAR1 mapping.
 * @Offset: byte offset added to sVirtualBar1Address.
 *
 * Returns whatever ioread64() delivers for that address. No range or NULL
 * check is done here; the caller is responsible for the offset.
 */
u64 ReadWord64(u32 Offset)
{
	void *RegisterAddress = sVirtualBar1Address + Offset;

	return ioread64(RegisterAddress);
}

/**********************************************************************************/
/*
 * WriteWord64 - write one 64-bit word into the BAR1 mapping.
 * @Offset: byte offset added to sVirtualBar1Address.
 * @Data:   value handed to iowrite64().
 *
 * No range or NULL check is done here; the caller is responsible for the
 * offset.
 */
void WriteWord64(u32 Offset, u64 Data)
{
	void *RegisterAddress = sVirtualBar1Address + Offset;

	iowrite64(Data, RegisterAddress);
}

/**********************************************************************************/
static long FioIoctl (struct file *file,unsigned int IoctlCode,unsigned long IoctlParam)
{
	WRITE_WORD64_REQUEST WriteWord64Request;
	READ_WORD64_REQUEST ReadWord64Request;
	GET_PHYSICAL_BUFFER_REQUEST GetPhysicalBufferRequest;

	int rc;

	//printk("--> FioIoctl. Minor=%x\n", Minor);

	switch (IoctlCode)
	{
	case WRITE_WORD64_REQUEST_CODE:
		//copy request from user space
		rc = copy_from_user(&WriteWord64Request, (void*)IoctlParam, sizeof(WRITE_WORD64_REQUEST));
		if (rc)
			printk("FioIoctl: copy_from_user failed. rc=0x%x\n", rc);
		WriteWord64 (WriteWord64Request.Offset, WriteWord64Request.Data);
		printk ("WR: 0x%08x to   0x%08x\n", WriteWord64Request.Data, WriteWord64Request.Offset);
		break;

	case READ_WORD64_REQUEST_CODE:
		//copy request from user space
		rc = copy_from_user(&ReadWord64Request, (void*)IoctlParam, sizeof(READ_WORD64_REQUEST));
		if (rc)
			printk("rcm_ioctl: copy_from_user failed. rc=0x%x\n", rc);
		ReadWord64Request.Data = ReadWord64 (ReadWord64Request.Offset);
		printk ("RD: 0x%08x from 0x%08x\n", ReadWord64Request.Data, ReadWord64Request.Offset);
		rc = copy_to_user((void*)IoctlParam, &ReadWord64Request, sizeof(READ_WORD64_REQUEST));
		if (rc)
			printk("FioIoctl: copy_to_user failed. rc=0x%x\n", rc);
		break;

	case INTERRUPT_REQUEST_CODE:
		wait_event_interruptible(sWaitQueuePacket, sInterruptFlagPacket != 0);
		sInterruptFlagPacket = 0;
		break;

	case GET_PHYSICAL_BUFFER_REQUEST_CODE:
		rc = copy_from_user(&GetPhysicalBufferRequest, (void*)IoctlParam, sizeof(GET_PHYSICAL_BUFFER_REQUEST));
		if (rc)
			printk("rcm_ioctl: copy_from_user failed. rc=0x%x\n", rc);

		GetPhysicalBufferRequest.Address = sPhysicalKernelCommonBuffer;

		rc = copy_to_user((void*)IoctlParam, &GetPhysicalBufferRequest, sizeof(GET_PHYSICAL_BUFFER_REQUEST));
		if (rc)
			printk("FioIoctl: copy_to_user failed. rc=0x%x\n", rc);
		break;

	default:
		printk ("FioIoctl: invalid ioctl code(0x%x)\n", IoctlCode);
		break;
	}
	//printk("<-- FioIoctl\n");

	return 0;
}

/**********************************************************************************/
static int FioMmap(struct file* flip, struct vm_area_struct *vma)
{
	int rc;
	
	printk("-->FioMmap\n");
	rc = remap_pfn_range  (vma, 
							vma->vm_start, 
							sPhysicalKernelCommonBuffer >> PAGE_SHIFT,
							vma->vm_end - vma->vm_start,
							vma->vm_page_prot);

	if (rc)
	{
		printk ("rcm_mmap: remap_page_range failed. rc=%d\n",rc);
		return -1;
	}
	printk("<--FioMmap\n");

	return 0;
}

/**********************************************************************************/
/*
 * IrqHandlerPacket - interrupt handler registered for the device IRQ.
 * @irq:  interrupt number (unused).
 * @data: dev_id cookie passed to request_irq() (unused).
 *
 * Reads the card status register. A zero status yields IRQ_NONE. Otherwise
 * the value read is written back to the same register, the packet flag is
 * raised, sleepers on sWaitQueuePacket are woken and IRQ_HANDLED is
 * returned.
 *
 * NOTE(review): the 64-bit register value is narrowed to 32 bits before it
 * is tested and written back; presumably only the low half carries status
 * bits - confirm against the FPGA register map.
 */
static irqreturn_t IrqHandlerPacket (int irq, void *data)
{
	u32 CardStatus;

	printk ("-->IrqHandler\n");

	CardStatus = ReadWord64(CARD_STATUS_OFFSET);
	if (CardStatus != 0)
	{
		WriteWord64(CARD_STATUS_OFFSET, CardStatus);

		sInterruptFlagPacket = 1;
		wake_up_interruptible(&sWaitQueuePacket);
		printk ("<--IrqHandler\n");

		return IRQ_HANDLED;
	}

	return IRQ_NONE;
}

/**********************************************************************************/
/*
 * FioOpen - open() handler of the "fio" character device.
 * @inode: inode being opened (unused).
 * @fl:    file being opened (unused).
 *
 * Keeps no per-open state: it logs entry and exit and always returns 0.
 */
static int FioOpen (struct inode *inode, struct file *fl)
{
	const int Result = 0;

	printk("-->FioOpen\n");
	printk("<--FioOpen\n");

	return Result;
}

/**********************************************************************************/
/*
 * FioRelease - release() handler of the "fio" character device.
 * @inode: inode being released (unused).
 * @fl:    file being released (unused).
 *
 * There is nothing to free per open file: it logs entry and exit and always
 * returns 0.
 */
static int FioRelease (struct inode *inode, struct file *fl)
{
	const int Result = 0;

	printk("-->FioRelease\n");
	printk("<--FioRelease\n");

	return Result;
}

/**********************************************************************************/
static struct file_operations sDrvOperations =
{
	unlocked_ioctl: FioIoctl,
	open	: FioOpen,
	release : FioRelease,
	mmap	: FioMmap,
	owner	: THIS_MODULE
};

/**********************************************************************************/
int AllocateCommonBuffer(struct pci_dev *pPciDev)
{
	int rc;
	u64 mask;

	/*rc=dma_set_mask_and_coherent (&pPciDev->dev, DMA_BIT_MASK(64));
	if (rc!=0)
	{
		printk ("dma_set_mask failed. rc=%d",rc);
		return -1;
	}*/
	mask = dma_get_mask(&pPciDev->dev);
	printk("maks=0x%llx\n", mask);

	//Loop for allocating common buffer. 
	sVirtualKernelCommonBuffer = dma_alloc_coherent (
							&pPciDev->dev, 
							COMMON_BUFFER_SIZE,
							&sDmaHandle,
							GFP_KERNEL | GFP_DMA);
	if (sVirtualKernelCommonBuffer == 0x0)
	{
		printk("FioInit: phys_to_virt failed\n");
		return -1;
	}

	sPhysicalKernelCommonBuffer = (u64)sDmaHandle;

	printk("sPhysicalKernelCommonBuffer=0x%llx\n", sPhysicalKernelCommonBuffer);
	*(u32*)sVirtualKernelCommonBuffer = 0xCAFE2DAD;

	return 0;
}


/**********************************************************************************/
int DeviceInit (struct pci_dev *pPciDev)
{
	int err;
	u32 Pattern;

	//enable device
	if (pci_enable_device(pPciDev))
	{
		printk("FioInit: pci_enable_device failed\n");
		return -EIO;
	}

	//find base address of BAR1
	sPhysicalBar1Address = pci_resource_start(pPciDev, 1);
	printk ("FioInit: Bar1Address=0x%x\n", sPhysicalBar1Address);

	//find length of BAR1
	sBar1Length = pci_resource_len (pPciDev, 1);
	printk ("FioInit: bar1_len_s=0x%x\n", sBar1Length);

	if (request_mem_region(sPhysicalBar1Address, sBar1Length,"fio")==NULL)
	{
		printk ("FioInit: request_mem_region failed\n");
		return -1;
	}
	
	//find virtual address of bar1
	sVirtualBar1Address = ioremap_nocache(sPhysicalBar1Address, sBar1Length);
	printk ("virtual_bar1_base_s=0x%p\n", sVirtualBar1Address);

	//Settings
	err = request_irq(pci_dev_s->irq, IrqHandlerPacket, IRQF_SHARED,"fio", pPciDev);
	if (err!=0)
	{
		printk("request_irq 0 failed.\n");
		return EBUSY;
	}

	Pattern=ReadWord64 (FPGA_VERSION_OFFSET);
	printk ("Pattern=0x%08x\n",Pattern);

	if (AllocateCommonBuffer(pPciDev) != 0)
		return -1;

	return 0;
}
/**********************************************************************************/
static int __init FioInit(void)
{
	int rc;

	printk("-->FioInit\n");

	rc=register_chrdev(0, "fio", &sDrvOperations);
	if (rc < 0)
	{
		printk("FioInit: register_chrdev failed\n");
		return -1;
	}

	printk("Major=%d\n", rc);

	//get device according to vendor,device
	pci_dev_s = pci_get_device(VENDOR_ID, DEVICE_ID, NULL);
	if (pci_dev_s == NULL)
	{
		printk("rcm_init: pci_get_device failed\n");
		return -ENODEV;
	}

	DeviceInit(pci_dev_s);

	printk("<--FioInit\n");
	return 0;
}

/**********************************************************************************/
static void __exit FioExit (void)
{
	printk(KERN_ALERT "-->FioExit\n");

	/*dma_free_coherent(&pci_dev_s->dev,
						COMMON_BUFFER_SIZE,
						sVirtualKernelCommonBuffer,
						sDmaHandle);*/
						
	printk(KERN_ALERT "<--FioExit\n");
}

module_init(FioInit);
module_exit(FioExit);


[Index of Archives]     [DMA Engine]     [Linux Coverity]     [Linux USB]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]     [Linux SCSI]     [Greybus]

  Powered by Linux