Re: [PATCH v2 06/12] drivers/soc: Add Aspeed XDMA Engine Driver

"Andrew Jeffery" <andrew@xxxxxxxx> · Wed, 11 Dec 2019 14:17:51 +1030

On Fri, 6 Dec 2019, at 03:45, Eddie James wrote:
> The XDMA engine embedded in the AST2500 and AST2600 SOCs performs PCI
> DMA operations between the SOC (acting as a BMC) and a host processor
> in a server.
> 
> This commit adds a driver to control the XDMA engine and adds functions
> to initialize the hardware and memory and start DMA operations.
> 
> Signed-off-by: Eddie James <eajames@xxxxxxxxxxxxx>
> ---
> Changes since v1:
>  - Add lock comments
>  - Don't split reset into start/finish
>  - Drop client_lock as the new reset method means client data is safe
>  - Switch to a chip structure to control versions rather than enum/switch
>  - Drop in_progress bool in favor or current_client being NULL or not
>  - Get SDRAM controller from dts phandle
>  - Configure PCI-E based on dts property
>  - Get VGA memory space from dts property
>  - Drop bmc_addr from aspeed_xdma_op as mmap can do all that
> 
>  MAINTAINERS                      |   2 +
>  drivers/soc/aspeed/Kconfig       |   8 +
>  drivers/soc/aspeed/Makefile      |   1 +
>  drivers/soc/aspeed/aspeed-xdma.c | 783 +++++++++++++++++++++++++++++++++++++++
>  include/uapi/linux/aspeed-xdma.h |  41 ++
>  5 files changed, 835 insertions(+)
>  create mode 100644 drivers/soc/aspeed/aspeed-xdma.c
>  create mode 100644 include/uapi/linux/aspeed-xdma.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 528a142..617c03d 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2712,6 +2712,8 @@ M:	Eddie James <eajames@xxxxxxxxxxxxx>
>  L:	linux-aspeed@xxxxxxxxxxxxxxxx (moderated for non-subscribers)
>  S:	Maintained
>  F:	Documentation/devicetree/bindings/soc/aspeed/xdma.txt
> +F:	drivers/soc/aspeed/aspeed-xdma.c
> +F:	include/uapi/linux/aspeed-xdma.h
>  
>  ASUS NOTEBOOKS AND EEEPC ACPI/WMI EXTRAS DRIVERS
>  M:	Corentin Chary <corentin.chary@xxxxxxxxx>
> diff --git a/drivers/soc/aspeed/Kconfig b/drivers/soc/aspeed/Kconfig
> index 323e177..2a6c16f 100644
> --- a/drivers/soc/aspeed/Kconfig
> +++ b/drivers/soc/aspeed/Kconfig
> @@ -29,4 +29,12 @@ config ASPEED_P2A_CTRL
>  	  ioctl()s, the driver also provides an interface for userspace mappings to
>  	  a pre-defined region.
>  
> +config ASPEED_XDMA
> +	tristate "Aspeed XDMA Engine Driver"
> +	depends on SOC_ASPEED && REGMAP && MFD_SYSCON && HAS_DMA
> +	help
> +	  Enable support for the Aspeed XDMA Engine found on the Aspeed AST2XXX
> +	  SOCs. The XDMA engine can perform automatic PCI DMA operations
> +	  between the AST2XXX (acting as a BMC) and a host processor.
> +
>  endmenu
> diff --git a/drivers/soc/aspeed/Makefile b/drivers/soc/aspeed/Makefile
> index b64be47..977b046 100644
> --- a/drivers/soc/aspeed/Makefile
> +++ b/drivers/soc/aspeed/Makefile
> @@ -2,3 +2,4 @@
>  obj-$(CONFIG_ASPEED_LPC_CTRL)	+= aspeed-lpc-ctrl.o
>  obj-$(CONFIG_ASPEED_LPC_SNOOP)	+= aspeed-lpc-snoop.o
>  obj-$(CONFIG_ASPEED_P2A_CTRL)	+= aspeed-p2a-ctrl.o
> +obj-$(CONFIG_ASPEED_XDMA)	+= aspeed-xdma.o
> diff --git a/drivers/soc/aspeed/aspeed-xdma.c b/drivers/soc/aspeed/aspeed-xdma.c
> new file mode 100644
> index 0000000..a9b3eeb
> --- /dev/null
> +++ b/drivers/soc/aspeed/aspeed-xdma.c
> @@ -0,0 +1,783 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +// Copyright IBM Corp 2019
> +
> +#include <linux/aspeed-xdma.h>
> +#include <linux/bitfield.h>
> +#include <linux/clk.h>
> +#include <linux/delay.h>
> +#include <linux/device.h>
> +#include <linux/dma-mapping.h>
> +#include <linux/fs.h>
> +#include <linux/genalloc.h>
> +#include <linux/interrupt.h>
> +#include <linux/io.h>
> +#include <linux/jiffies.h>
> +#include <linux/mfd/syscon.h>
> +#include <linux/module.h>
> +#include <linux/mutex.h>
> +#include <linux/of_device.h>
> +#include <linux/platform_device.h>
> +#include <linux/poll.h>
> +#include <linux/regmap.h>
> +#include <linux/reset.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/string.h>
> +#include <linux/uaccess.h>
> +#include <linux/wait.h>
> +#include <linux/workqueue.h>
> +
> +#define DEVICE_NAME				"aspeed-xdma"
> +
> +#define SCU_AST2500_PCIE_CONF			0x180
> +#define SCU_AST2600_PCIE_CONF			0xc20
> +#define  SCU_PCIE_CONF_VGA_EN			 BIT(0)
> +#define  SCU_PCIE_CONF_VGA_EN_MMIO		 BIT(1)
> +#define  SCU_PCIE_CONF_VGA_EN_LPC		 BIT(2)
> +#define  SCU_PCIE_CONF_VGA_EN_MSI		 BIT(3)
> +#define  SCU_PCIE_CONF_VGA_EN_MCTP		 BIT(4)
> +#define  SCU_PCIE_CONF_VGA_EN_IRQ		 BIT(5)
> +#define  SCU_PCIE_CONF_VGA_EN_DMA		 BIT(6)
> +#define  SCU_PCIE_CONF_BMC_EN			 BIT(8)
> +#define  SCU_PCIE_CONF_BMC_EN_MMIO		 BIT(9)
> +#define  SCU_PCIE_CONF_BMC_EN_MSI		 BIT(11)
> +#define  SCU_PCIE_CONF_BMC_EN_MCTP		 BIT(12)
> +#define  SCU_PCIE_CONF_BMC_EN_IRQ		 BIT(13)
> +#define  SCU_PCIE_CONF_BMC_EN_DMA		 BIT(14)
> +
> +#define SCU_AST2500_BMC_CLASS_REV		0x19c
> +#define SCU_AST2600_BMC_CLASS_REV		0xc4c
> +#define  SCU_BMC_CLASS_REV_XDMA			 0xff000001
> +
> +#define SDMC_REMAP				0x008
> +#define  SDMC_AST2500_REMAP_PCIE		 BIT(16)
> +#define  SDMC_AST2500_REMAP_XDMA		 BIT(17)
> +#define  SDMC_AST2600_REMAP_XDMA		 BIT(18)
> +
> +#define XDMA_CMDQ_SIZE				PAGE_SIZE
> +#define XDMA_NUM_CMDS				\
> +	(XDMA_CMDQ_SIZE / sizeof(struct aspeed_xdma_cmd))
> +
> +/* Aspeed specification requires 10ms after switching the reset line */
> +#define XDMA_RESET_TIME_MS			10
> +
> +#define XDMA_CMD_AST2500_PITCH_SHIFT		3
> +#define XDMA_CMD_AST2500_PITCH_BMC		GENMASK_ULL(62, 51)
> +#define XDMA_CMD_AST2500_PITCH_HOST		GENMASK_ULL(46, 35)
> +#define XDMA_CMD_AST2500_PITCH_UPSTREAM		BIT_ULL(31)
> +#define XDMA_CMD_AST2500_PITCH_ADDR		GENMASK_ULL(29, 4)
> +#define XDMA_CMD_AST2500_PITCH_ID		BIT_ULL(0)
> +#define XDMA_CMD_AST2500_CMD_IRQ_EN		BIT_ULL(31)
> +#define XDMA_CMD_AST2500_CMD_LINE_NO		GENMASK_ULL(27, 16)
> +#define XDMA_CMD_AST2500_CMD_IRQ_BMC		BIT_ULL(15)
> +#define XDMA_CMD_AST2500_CMD_LINE_SIZE_SHIFT	4
> +#define XDMA_CMD_AST2500_CMD_LINE_SIZE		\
> +	GENMASK_ULL(14, XDMA_CMD_AST2500_CMD_LINE_SIZE_SHIFT)
> +#define XDMA_CMD_AST2500_CMD_ID			BIT_ULL(1)
> +
> +#define XDMA_CMD_AST2600_PITCH_BMC		GENMASK_ULL(62, 48)
> +#define XDMA_CMD_AST2600_PITCH_HOST		GENMASK_ULL(46, 32)
> +#define XDMA_CMD_AST2600_PITCH_ADDR		GENMASK_ULL(30, 0)
> +#define XDMA_CMD_AST2600_CMD_64_EN		BIT_ULL(40)
> +#define XDMA_CMD_AST2600_CMD_IRQ_BMC		BIT_ULL(37)
> +#define XDMA_CMD_AST2600_CMD_IRQ_HOST		BIT_ULL(36)
> +#define XDMA_CMD_AST2600_CMD_UPSTREAM		BIT_ULL(32)
> +#define XDMA_CMD_AST2600_CMD_LINE_NO		GENMASK_ULL(27, 16)
> +#define XDMA_CMD_AST2600_CMD_LINE_SIZE		GENMASK_ULL(14, 0)
> +#define XDMA_CMD_AST2600_CMD_MULTILINE_SIZE	GENMASK_ULL(14, 12)
> +
> +#define XDMA_AST2500_QUEUE_ENTRY_SIZE		4
> +#define XDMA_AST2500_HOST_CMDQ_ADDR0		0x00
> +#define XDMA_AST2500_HOST_CMDQ_ENDP		0x04
> +#define XDMA_AST2500_HOST_CMDQ_WRITEP		0x08
> +#define XDMA_AST2500_HOST_CMDQ_READP		0x0c
> +#define XDMA_AST2500_BMC_CMDQ_ADDR		0x10
> +#define XDMA_AST2500_BMC_CMDQ_ENDP		0x14
> +#define XDMA_AST2500_BMC_CMDQ_WRITEP		0x18
> +#define XDMA_AST2500_BMC_CMDQ_READP		0x1c
> +#define  XDMA_BMC_CMDQ_READP_RESET		 0xee882266
> +#define XDMA_AST2500_CTRL			0x20
> +#define  XDMA_AST2500_CTRL_US_COMP		 BIT(4)
> +#define  XDMA_AST2500_CTRL_DS_COMP		 BIT(5)
> +#define  XDMA_AST2500_CTRL_DS_DIRTY		 BIT(6)
> +#define  XDMA_AST2500_CTRL_DS_SIZE_256		 BIT(17)
> +#define  XDMA_AST2500_CTRL_DS_TIMEOUT		 BIT(28)
> +#define  XDMA_AST2500_CTRL_DS_CHECK_ID		 BIT(29)
> +#define XDMA_AST2500_STATUS			0x24
> +#define  XDMA_AST2500_STATUS_US_COMP		 BIT(4)
> +#define  XDMA_AST2500_STATUS_DS_COMP		 BIT(5)
> +#define  XDMA_AST2500_STATUS_DS_DIRTY		 BIT(6)
> +#define XDMA_AST2500_INPRG_DS_CMD1		0x38
> +#define XDMA_AST2500_INPRG_DS_CMD2		0x3c
> +#define XDMA_AST2500_INPRG_US_CMD00		0x40
> +#define XDMA_AST2500_INPRG_US_CMD01		0x44
> +#define XDMA_AST2500_INPRG_US_CMD10		0x48
> +#define XDMA_AST2500_INPRG_US_CMD11		0x4c
> +#define XDMA_AST2500_INPRG_US_CMD20		0x50
> +#define XDMA_AST2500_INPRG_US_CMD21		0x54
> +#define XDMA_AST2500_HOST_CMDQ_ADDR1		0x60
> +#define XDMA_AST2500_VGA_CMDQ_ADDR0		0x64
> +#define XDMA_AST2500_VGA_CMDQ_ENDP		0x68
> +#define XDMA_AST2500_VGA_CMDQ_WRITEP		0x6c
> +#define XDMA_AST2500_VGA_CMDQ_READP		0x70
> +#define XDMA_AST2500_VGA_CMD_STATUS		0x74
> +#define XDMA_AST2500_VGA_CMDQ_ADDR1		0x78
> +
> +#define XDMA_AST2600_QUEUE_ENTRY_SIZE		2
> +#define XDMA_AST2600_HOST_CMDQ_ADDR0		0x00
> +#define XDMA_AST2600_HOST_CMDQ_ADDR1		0x04
> +#define XDMA_AST2600_HOST_CMDQ_ENDP		0x08
> +#define XDMA_AST2600_HOST_CMDQ_WRITEP		0x0c
> +#define XDMA_AST2600_HOST_CMDQ_READP		0x10
> +#define XDMA_AST2600_BMC_CMDQ_ADDR		0x14
> +#define XDMA_AST2600_BMC_CMDQ_ENDP		0x18
> +#define XDMA_AST2600_BMC_CMDQ_WRITEP		0x1c
> +#define XDMA_AST2600_BMC_CMDQ_READP		0x20
> +#define XDMA_AST2600_VGA_CMDQ_ADDR0		0x24
> +#define XDMA_AST2600_VGA_CMDQ_ADDR1		0x28
> +#define XDMA_AST2600_VGA_CMDQ_ENDP		0x2c
> +#define XDMA_AST2600_VGA_CMDQ_WRITEP		0x30
> +#define XDMA_AST2600_VGA_CMDQ_READP		0x34
> +#define XDMA_AST2600_CTRL			0x38
> +#define  XDMA_AST2600_CTRL_US_COMP		 BIT(16)
> +#define  XDMA_AST2600_CTRL_DS_COMP		 BIT(17)
> +#define  XDMA_AST2600_CTRL_DS_DIRTY		 BIT(18)
> +#define  XDMA_AST2600_CTRL_DS_SIZE_256		 BIT(20)
> +#define XDMA_AST2600_STATUS			0x3c
> +#define  XDMA_AST2600_STATUS_US_COMP		 BIT(16)
> +#define  XDMA_AST2600_STATUS_DS_COMP		 BIT(17)
> +#define  XDMA_AST2600_STATUS_DS_DIRTY		 BIT(18)
> +#define XDMA_AST2600_INPRG_DS_CMD00		0x40
> +#define XDMA_AST2600_INPRG_DS_CMD01		0x44
> +#define XDMA_AST2600_INPRG_DS_CMD10		0x48
> +#define XDMA_AST2600_INPRG_DS_CMD11		0x4c
> +#define XDMA_AST2600_INPRG_DS_CMD20		0x50
> +#define XDMA_AST2600_INPRG_DS_CMD21		0x54
> +#define XDMA_AST2600_INPRG_US_CMD00		0x60
> +#define XDMA_AST2600_INPRG_US_CMD01		0x64
> +#define XDMA_AST2600_INPRG_US_CMD10		0x68
> +#define XDMA_AST2600_INPRG_US_CMD11		0x6c
> +#define XDMA_AST2600_INPRG_US_CMD20		0x70
> +#define XDMA_AST2600_INPRG_US_CMD21		0x74
> +
> +struct aspeed_xdma_cmd {
> +	u64 host_addr;
> +	u64 pitch;
> +	u64 cmd;
> +	u64 reserved;
> +};
> +
> +struct aspeed_xdma_regs {
> +	u8 bmc_cmdq_addr;
> +	u8 bmc_cmdq_endp;
> +	u8 bmc_cmdq_writep;
> +	u8 bmc_cmdq_readp;
> +	u8 control;
> +	u8 status;
> +};
> +
> +struct aspeed_xdma_status_bits {
> +	u32 us_comp;
> +	u32 ds_comp;
> +	u32 ds_dirty;
> +};
> +
> +struct aspeed_xdma;
> +
> +struct aspeed_xdma_chip {
> +	u32 control;
> +	u32 scu_bmc_class;
> +	u32 scu_pcie_conf;
> +	u32 sdmc_remap;
> +	unsigned int queue_entry_size;
> +	struct aspeed_xdma_regs regs;
> +	struct aspeed_xdma_status_bits status_bits;
> +	unsigned int (*set_cmd)(struct aspeed_xdma *ctx,
> +				struct aspeed_xdma_op *op, u32 bmc_addr);
> +};
> +
> +struct aspeed_xdma_client;
> +
> +struct aspeed_xdma {
> +	const struct aspeed_xdma_chip *chip;
> +
> +	struct device *dev;
> +	void __iomem *base;
> +	struct clk *clock;
> +	struct reset_control *reset;
> +
> +	struct aspeed_xdma_client *current_client;
> +
> +	/* start_lock protects cmd_idx, cmdq, and the state of the engine */
> +	struct mutex start_lock;
> +	void *cmdq;

Can this not be typed as `struct aspeed_xdma_cmd *cmdq`?

> +	bool in_reset;

Bit of a nit, but can we move in_reset under reset_lock below?

> +	bool upstream;
> +	unsigned int cmd_idx;
> +
> +	/* reset_lock protects in_reset and the reset state of the engine */
> +	spinlock_t reset_lock;
> +
> +	wait_queue_head_t wait;
> +	struct work_struct reset_work;
> +
> +	u32 vga_phys;
> +	u32 vga_size;
> +	void __iomem *vga_virt;
> +	dma_addr_t cmdq_vga_phys;
> +	void *cmdq_vga_virt;
> +	struct gen_pool *vga_pool;

This shouldn't have anything to do with VGA specifically, we could
theoretically use any other piece of reserved memory (in the right
security context).

> +};
> +
> +struct aspeed_xdma_client {
> +	struct aspeed_xdma *ctx;
> +
> +	bool error;
> +	bool in_progress;
> +	void *virt;
> +	dma_addr_t phys;
> +	u32 size;
> +};
> +
> +static u32 aspeed_xdma_readl(struct aspeed_xdma *ctx, u8 reg)
> +{
> +	u32 v = readl(ctx->base + reg);
> +
> +	dev_dbg(ctx->dev, "read %02x[%08x]\n", reg, v);
> +	return v;
> +}
> +
> +static void aspeed_xdma_writel(struct aspeed_xdma *ctx, u8 reg, u32 val)
> +{
> +	writel(val, ctx->base + reg);
> +	dev_dbg(ctx->dev, "write %02x[%08x]\n", reg, val);
> +}
> +
> +static void aspeed_xdma_init_eng(struct aspeed_xdma *ctx)
> +{
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.bmc_cmdq_endp,
> +			   ctx->chip->queue_entry_size * XDMA_NUM_CMDS);
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.bmc_cmdq_readp,
> +			   XDMA_BMC_CMDQ_READP_RESET);
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.bmc_cmdq_writep, 0);
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.control, ctx->chip->control);
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.bmc_cmdq_addr,
> +			   ctx->cmdq_vga_phys);
> +
> +	ctx->cmd_idx = 0;
> +	ctx->current_client = NULL;
> +}
> +
> +static unsigned int aspeed_xdma_ast2500_set_cmd(struct aspeed_xdma *ctx,
> +						struct aspeed_xdma_op *op,
> +						u32 bmc_addr)
> +{
> +	u64 cmd = XDMA_CMD_AST2500_CMD_IRQ_EN | XDMA_CMD_AST2500_CMD_IRQ_BMC |
> +		XDMA_CMD_AST2500_CMD_ID;
> +	u64 cmd_pitch = (op->direction ? XDMA_CMD_AST2500_PITCH_UPSTREAM : 0) |
> +		XDMA_CMD_AST2500_PITCH_ID;
> +	unsigned int line_size;
> +	unsigned int nidx = (ctx->cmd_idx + 1) % XDMA_NUM_CMDS;
> +	unsigned int line_no = 1;
> +	unsigned int pitch = 1;
> +	struct aspeed_xdma_cmd *ncmd =
> +		&(((struct aspeed_xdma_cmd *)ctx->cmdq)[ctx->cmd_idx]);

Changing ctx->cmdq away from a `void *` would help out here.

> +
> +	dev_dbg(ctx->dev, "xdma %s ast2500: bmc[%08x] len[%08x] host[%08x]\n",
> +		op->direction ? "upstream" : "downstream", bmc_addr, op->len,
> +		(u32)op->host_addr);
> +
> +	if (op->len > XDMA_CMD_AST2500_CMD_LINE_SIZE) {
> +		unsigned int rem;
> +		unsigned int total;
> +
> +		line_no = op->len / XDMA_CMD_AST2500_CMD_LINE_SIZE;
> +		total = XDMA_CMD_AST2500_CMD_LINE_SIZE * line_no;
> +		rem = (op->len - total) >>
> +			XDMA_CMD_AST2500_CMD_LINE_SIZE_SHIFT;
> +		line_size = XDMA_CMD_AST2500_CMD_LINE_SIZE;
> +		pitch = line_size >> XDMA_CMD_AST2500_PITCH_SHIFT;
> +		line_size >>= XDMA_CMD_AST2500_CMD_LINE_SIZE_SHIFT;
> +
> +		if (rem) {
> +			u32 rbmc = bmc_addr + total;
> +			struct aspeed_xdma_cmd *rcmd =
> +				&(((struct aspeed_xdma_cmd *)ctx->cmdq)[nidx]);
> +
> +			rcmd->host_addr = op->host_addr + (u64)total;
> +			rcmd->pitch = cmd_pitch |
> +				((u64)rbmc & XDMA_CMD_AST2500_PITCH_ADDR) |
> +				FIELD_PREP(XDMA_CMD_AST2500_PITCH_HOST, 1) |
> +				FIELD_PREP(XDMA_CMD_AST2500_PITCH_BMC, 1);
> +			rcmd->cmd = cmd |
> +				FIELD_PREP(XDMA_CMD_AST2500_CMD_LINE_NO, 1) |
> +				FIELD_PREP(XDMA_CMD_AST2500_CMD_LINE_SIZE,
> +					   rem);
> +
> +			print_hex_dump_debug("xdma rem ", DUMP_PREFIX_OFFSET,
> +					     16, 1, rcmd, sizeof(*rcmd), true);
> +
> +			cmd &= ~(XDMA_CMD_AST2500_CMD_IRQ_EN |
> +				 XDMA_CMD_AST2500_CMD_IRQ_BMC);
> +
> +			nidx = (nidx + 1) % XDMA_NUM_CMDS;
> +		}
> +	} else {
> +		line_size = op->len >> XDMA_CMD_AST2500_CMD_LINE_SIZE_SHIFT;
> +	}
> +
> +	ncmd->host_addr = op->host_addr;
> +	ncmd->pitch = cmd_pitch |
> +		((u64)bmc_addr & XDMA_CMD_AST2500_PITCH_ADDR) |
> +		FIELD_PREP(XDMA_CMD_AST2500_PITCH_HOST, pitch) |
> +		FIELD_PREP(XDMA_CMD_AST2500_PITCH_BMC, pitch);
> +	ncmd->cmd = cmd | FIELD_PREP(XDMA_CMD_AST2500_CMD_LINE_NO, line_no) |
> +		FIELD_PREP(XDMA_CMD_AST2500_CMD_LINE_SIZE, line_size);
> +
> +	print_hex_dump_debug("xdma cmd ", DUMP_PREFIX_OFFSET, 16, 1, ncmd,
> +			     sizeof(*ncmd), true);
> +
> +	return nidx;
> +}
> +
> +static unsigned int aspeed_xdma_ast2600_set_cmd(struct aspeed_xdma *ctx,
> +						struct aspeed_xdma_op *op,
> +						u32 bmc_addr)
> +{
> +	u64 cmd = XDMA_CMD_AST2600_CMD_IRQ_BMC |
> +		(op->direction ? XDMA_CMD_AST2600_CMD_UPSTREAM : 0);
> +	unsigned int line_size;
> +	unsigned int nidx = (ctx->cmd_idx + 1) % XDMA_NUM_CMDS;
> +	unsigned int line_no = 1;
> +	unsigned int pitch = 1;
> +	struct aspeed_xdma_cmd *ncmd =
> +		&(((struct aspeed_xdma_cmd *)ctx->cmdq)[ctx->cmd_idx]);
> +
> +	if ((op->host_addr + op->len) & 0xffffffff00000000ULL)

Do we know that this won't wrap?

> +		cmd |= XDMA_CMD_AST2600_CMD_64_EN;
> +
> +	dev_dbg(ctx->dev, "xdma %s ast2600: bmc[%08x] len[%08x] "
> +		"host[%016llx]\n", op->direction ? "upstream" : "downstream",
> +		bmc_addr, op->len, op->host_addr);
> +
> +	if (op->len > XDMA_CMD_AST2600_CMD_LINE_SIZE) {
> +		unsigned int rem;
> +		unsigned int total;
> +
> +		line_no = op->len / XDMA_CMD_AST2600_CMD_MULTILINE_SIZE;
> +		total = XDMA_CMD_AST2600_CMD_MULTILINE_SIZE * line_no;
> +		rem = op->len - total;
> +		line_size = XDMA_CMD_AST2600_CMD_MULTILINE_SIZE;
> +		pitch = line_size;
> +
> +		if (rem) {
> +			u32 rbmc = bmc_addr + total;
> +			struct aspeed_xdma_cmd *rcmd =
> +				&(((struct aspeed_xdma_cmd *)ctx->cmdq)[nidx]);
> +
> +			rcmd->host_addr = op->host_addr + (u64)total;
> +			rcmd->pitch =
> +				((u64)rbmc & XDMA_CMD_AST2600_PITCH_ADDR) |
> +				FIELD_PREP(XDMA_CMD_AST2600_PITCH_HOST, 1) |
> +				FIELD_PREP(XDMA_CMD_AST2600_PITCH_BMC, 1);
> +			rcmd->cmd = cmd |
> +				FIELD_PREP(XDMA_CMD_AST2600_CMD_LINE_NO, 1) |
> +				FIELD_PREP(XDMA_CMD_AST2600_CMD_LINE_SIZE,
> +					   rem);
> +
> +			print_hex_dump_debug("xdma rem ", DUMP_PREFIX_OFFSET,
> +					     16, 1, rcmd, sizeof(*rcmd), true);
> +
> +			cmd &= ~XDMA_CMD_AST2600_CMD_IRQ_BMC;
> +
> +			nidx = (nidx + 1) % XDMA_NUM_CMDS;
> +		}
> +	} else {
> +		line_size = op->len;
> +	}
> +
> +	ncmd->host_addr = op->host_addr;
> +	ncmd->pitch = ((u64)bmc_addr & XDMA_CMD_AST2600_PITCH_ADDR) |
> +		FIELD_PREP(XDMA_CMD_AST2600_PITCH_HOST, pitch) |
> +		FIELD_PREP(XDMA_CMD_AST2600_PITCH_BMC, pitch);
> +	ncmd->cmd = cmd | FIELD_PREP(XDMA_CMD_AST2600_CMD_LINE_NO, line_no) |
> +		FIELD_PREP(XDMA_CMD_AST2600_CMD_LINE_SIZE, line_size);
> +
> +	print_hex_dump_debug("xdma cmd ", DUMP_PREFIX_OFFSET, 16, 1, ncmd,
> +			     sizeof(*ncmd), true);
> +
> +	return nidx;
> +}
> +
> +static void aspeed_xdma_start(struct aspeed_xdma *ctx,
> +			      struct aspeed_xdma_op *op, u32 bmc_addr,
> +			      struct aspeed_xdma_client *client)
> +{
> +	unsigned int nidx;
> +
> +	mutex_lock(&ctx->start_lock);
> +
> +	nidx = ctx->chip->set_cmd(ctx, op, bmc_addr);
> +	memcpy(ctx->cmdq_vga_virt, ctx->cmdq, XDMA_CMDQ_SIZE);
> +
> +	client->in_progress = true;
> +	ctx->current_client = client;
> +	ctx->upstream = op->direction ? true : false;

Could get away without the branch, just assign directly or use the !! idiom
(you're already turning op->direction into a boolean by using it as the
condition).

> +
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.bmc_cmdq_writep,
> +			   nidx * ctx->chip->queue_entry_size);
> +
> +	ctx->cmd_idx = nidx;
> +
> +	mutex_unlock(&ctx->start_lock);
> +}
> +
> +static void aspeed_xdma_done(struct aspeed_xdma *ctx, bool error)
> +{
> +	if (ctx->current_client) {
> +		ctx->current_client->error = error;
> +		ctx->current_client->in_progress = false;
> +		ctx->current_client = NULL;

You need to take start_lock before writing these members to ensure the
writes are not reordered across acquisition of start_lock in
aspeed_xdma_start() above, unless there's some other guarantee of that?

I see that aspeed_xdma_done() is called from aspeed_xdma_irq() below,
so maybe we need a different locking strategy or punt aspeed_xdma_done()
to a workqueue, as start_lock is a mutex.

> +	}
> +
> +	wake_up_interruptible_all(&ctx->wait);
> +}
> +
> +static irqreturn_t aspeed_xdma_irq(int irq, void *arg)
> +{
> +	struct aspeed_xdma *ctx = arg;
> +	u32 status = aspeed_xdma_readl(ctx, ctx->chip->regs.status);
> +
> +	if (status & ctx->chip->status_bits.ds_dirty) {
> +		aspeed_xdma_done(ctx, true);
> +	} else {
> +		if (status & ctx->chip->status_bits.us_comp) {
> +			if (ctx->upstream)
> +				aspeed_xdma_done(ctx, false);
> +		}
> +
> +		if (status & ctx->chip->status_bits.ds_comp) {
> +			if (!ctx->upstream)
> +				aspeed_xdma_done(ctx, false);
> +		}
> +	}
> +
> +	aspeed_xdma_writel(ctx, ctx->chip->regs.status, status);
> +
> +	return IRQ_HANDLED;
> +}
> +
> +static void aspeed_xdma_reset(struct aspeed_xdma *ctx)
> +{
> +	reset_control_assert(ctx->reset);
> +	msleep(XDMA_RESET_TIME_MS);
> +
> +	reset_control_deassert(ctx->reset);
> +	msleep(XDMA_RESET_TIME_MS);
> +
> +	aspeed_xdma_init_eng(ctx);
> +
> +	ctx->in_reset = false;
> +	aspeed_xdma_done(ctx, true);
> +}
> +
> +static void aspeed_xdma_reset_work(struct work_struct *work)
> +{
> +	struct aspeed_xdma *ctx = container_of(work, struct aspeed_xdma,
> +					       reset_work);
> +
> +	/*
> +	 * Lock to make sure operations aren't started while the engine is
> +	 * in reset.
> +	 */
> +	mutex_lock(&ctx->start_lock);
> +
> +	aspeed_xdma_reset(ctx);
> +
> +	mutex_unlock(&ctx->start_lock);
> +}
> +
> +static irqreturn_t aspeed_xdma_pcie_irq(int irq, void *arg)
> +{
> +	unsigned long flags;
> +	struct aspeed_xdma *ctx = arg;
> +
> +	dev_dbg(ctx->dev, "pcie reset\n");
> +
> +	spin_lock_irqsave(&ctx->reset_lock, flags);
> +	if (ctx->in_reset) {
> +		spin_unlock_irqrestore(&ctx->reset_lock, flags);
> +		return IRQ_HANDLED;
> +	}
> +
> +	ctx->in_reset = true;
> +	spin_unlock_irqrestore(&ctx->reset_lock, flags);
> +
> +	schedule_work(&ctx->reset_work);
> +	return IRQ_HANDLED;
> +}
> +
> +static int aspeed_xdma_probe(struct platform_device *pdev)
> +{
> +	int irq;
> +	int pcie_irq;
> +	int rc;
> +	u32 vgamem[2];
> +	struct regmap *scu;
> +	struct regmap *sdmc;
> +	struct aspeed_xdma *ctx;
> +	struct device *dev = &pdev->dev;
> +	const void *md = of_device_get_match_data(dev);

So close to reverse christmas tree :)

> +
> +	if (!md)
> +		return -ENODEV;
> +
> +	ctx = devm_kzalloc(dev, sizeof(*ctx), GFP_KERNEL);
> +	if (!ctx)
> +		return -ENOMEM;
> +
> +	ctx->chip = md;
> +	ctx->dev = dev;
> +	platform_set_drvdata(pdev, ctx);
> +	mutex_init(&ctx->start_lock);
> +	INIT_WORK(&ctx->reset_work, aspeed_xdma_reset_work);
> +	spin_lock_init(&ctx->reset_lock);
> +	init_waitqueue_head(&ctx->wait);
> +
> +	ctx->base = devm_platform_ioremap_resource(pdev, 0);
> +	if (IS_ERR(ctx->base)) {
> +		dev_err(dev, "Failed to map registers.\n");
> +		return PTR_ERR(ctx->base);
> +	}
> +
> +	irq = platform_get_irq(pdev, 0);
> +	if (irq < 0) {
> +		dev_err(dev, "Unable to find IRQ.\n");
> +		return irq;
> +	}
> +
> +	rc = devm_request_irq(dev, irq, aspeed_xdma_irq, IRQF_SHARED,
> +			      DEVICE_NAME, ctx);
> +	if (rc < 0) {
> +		dev_err(dev, "Failed to request IRQ %d.\n", irq);
> +		return rc;
> +	}
> +
> +	ctx->clock = devm_clk_get(dev, NULL);
> +	if (IS_ERR(ctx->clock)) {
> +		dev_err(dev, "Failed to request clock.\n");
> +		return PTR_ERR(ctx->clock);
> +	}
> +
> +	ctx->reset = devm_reset_control_get_exclusive(dev, NULL);
> +	if (IS_ERR(ctx->reset)) {
> +		dev_err(dev, "Failed to request reset control.\n");
> +		return PTR_ERR(ctx->reset);
> +	}
> +
> +	ctx->cmdq = devm_kzalloc(ctx->dev, XDMA_CMDQ_SIZE, GFP_KERNEL);
> +	if (!ctx->cmdq) {
> +		dev_err(dev, "Failed to allocate command queue.\n");
> +		return -ENOMEM;
> +	}
> +
> +	ctx->vga_pool = devm_gen_pool_create(dev, ilog2(PAGE_SIZE), -1, NULL);
> +	if (!ctx->vga_pool) {
> +		dev_err(dev, "Failed to setup genalloc pool.\n");
> +		return -ENOMEM;
> +	}
> +
> +	rc = of_property_read_u32_array(dev->of_node, "vga-mem", vgamem, 2);

As mentioned, this could be any reserved memory range. Also can't we get it as
a resource rather than parsing a u32 array? Not sure if there's an advantage
but it feels like a better representation.

> +	if (rc) {
> +		dev_err(dev, "Unable to get VGA memory space.\n");
> +		return rc;
> +	}
> +
> +	ctx->vga_phys = vgamem[0];
> +	ctx->vga_size = vgamem[1];
> +
> +	ctx->vga_virt = devm_ioremap(dev, ctx->vga_phys, ctx->vga_size);
> +	if (IS_ERR(ctx->vga_virt)) {
> +		dev_err(dev, "Failed to map VGA memory space.\n");
> +		return PTR_ERR(ctx->vga_virt);
> +	}
> +
> +	rc = gen_pool_add_virt(ctx->vga_pool, (unsigned long)ctx->vga_virt,
> +			       ctx->vga_phys, ctx->vga_size, -1);
> +	if (rc) {
> +		dev_err(ctx->dev, "Failed to add memory to genalloc pool.\n");
> +		return rc;
> +	}
> +
> +	sdmc = syscon_regmap_lookup_by_phandle(dev->of_node, "sdmc");
> +	if (IS_ERR(sdmc)) {
> +		dev_err(dev, "Unable to configure memory controller.\n");
> +		return PTR_ERR(sdmc);
> +	}
> +
> +	regmap_update_bits(sdmc, SDMC_REMAP, ctx->chip->sdmc_remap,
> +			   ctx->chip->sdmc_remap);

I disagree with doing this. As mentioned on the bindings it should be up to
the platform integrator to ensure that this is configured appropriately.

> +
> +	scu = syscon_regmap_lookup_by_phandle(dev->of_node, "scu");
> +	if (!IS_ERR(scu)) {
> +		u32 selection;
> +		bool pcie_device_bmc = true;
> +		const u32 bmc = SCU_PCIE_CONF_BMC_EN |
> +			SCU_PCIE_CONF_BMC_EN_MSI | SCU_PCIE_CONF_BMC_EN_IRQ |
> +			SCU_PCIE_CONF_BMC_EN_DMA;
> +		const u32 vga = SCU_PCIE_CONF_VGA_EN |
> +			SCU_PCIE_CONF_VGA_EN_MSI | SCU_PCIE_CONF_VGA_EN_IRQ |
> +			SCU_PCIE_CONF_VGA_EN_DMA;
> +		const char *pcie = NULL;
> +
> +		if (!of_property_read_string(dev->of_node, "pcie-device",
> +					     &pcie)) {
> +			if (!strcmp(pcie, "vga"))
> +				pcie_device_bmc = false;

You need to return an error here if the string is not one of "vga" or "bmc". This
just assumes that anything that isn't "vga" is "bmc".

> +		}
> +
> +		if (pcie_device_bmc) {
> +			selection = bmc;
> +			regmap_write(scu, ctx->chip->scu_bmc_class,
> +				     SCU_BMC_CLASS_REV_XDMA);
> +		} else {
> +			selection = vga;
> +		}
> +
> +		regmap_update_bits(scu, ctx->chip->scu_pcie_conf, bmc | vga,
> +				   selection);
> +	} else {
> +		dev_warn(dev, "Unable to configure PCIe; continuing.\n");
> +	}
> +
> +	rc = clk_prepare_enable(ctx->clock);
> +	if (rc) {
> +		dev_err(dev, "Failed to enable the clock.\n");
> +		return rc;
> +	}
> +	msleep(XDMA_RESET_TIME_MS);
> +
> +	rc = reset_control_deassert(ctx->reset);
> +	if (rc) {
> +		clk_disable_unprepare(ctx->clock);
> +
> +		dev_err(dev, "Failed to clear the reset.\n");
> +		return rc;
> +	}
> +	msleep(XDMA_RESET_TIME_MS);
> +
> +	ctx->cmdq_vga_virt = gen_pool_dma_alloc(ctx->vga_pool, XDMA_CMDQ_SIZE,
> +						&ctx->cmdq_vga_phys);
> +	if (!ctx->cmdq_vga_virt) {
> +		dev_err(ctx->dev, "Failed to genalloc cmdq.\n");
> +
> +		reset_control_assert(ctx->reset);
> +		clk_disable_unprepare(ctx->clock);
> +		return -ENOMEM;
> +	}
> +
> +	aspeed_xdma_init_eng(ctx);
> +
> +	/*
> +	 * This interrupt could fire immediately so only request it once the
> +	 * engine and driver are initialized.
> +	 */
> +	pcie_irq = platform_get_irq(pdev, 1);
> +	if (pcie_irq < 0) {
> +		dev_warn(dev, "Unable to find PCI-E IRQ.\n");
> +	} else {
> +		rc = devm_request_irq(dev, pcie_irq, aspeed_xdma_pcie_irq,
> +				      IRQF_SHARED, DEVICE_NAME, ctx);
> +		if (rc < 0)
> +			dev_warn(dev, "Failed to request PCI-E IRQ %d.\n", rc);
> +	}
> +
> +	return 0;
> +}
> +
> +static int aspeed_xdma_remove(struct platform_device *pdev)
> +{
> +	struct aspeed_xdma *ctx = platform_get_drvdata(pdev);
> +
> +	gen_pool_free(ctx->vga_pool, (unsigned long)ctx->cmdq_vga_virt,
> +		      XDMA_CMDQ_SIZE);
> +
> +	reset_control_assert(ctx->reset);
> +	clk_disable_unprepare(ctx->clock);
> +
> +	return 0;
> +}
> +
> +static const struct aspeed_xdma_chip aspeed_ast2500_xdma_chip = {
> +	.control = XDMA_AST2500_CTRL_US_COMP | XDMA_AST2500_CTRL_DS_COMP |
> +		XDMA_AST2500_CTRL_DS_DIRTY | XDMA_AST2500_CTRL_DS_SIZE_256 |
> +		XDMA_AST2500_CTRL_DS_TIMEOUT | XDMA_AST2500_CTRL_DS_CHECK_ID,
> +	.scu_bmc_class = SCU_AST2500_BMC_CLASS_REV,
> +	.scu_pcie_conf = SCU_AST2500_PCIE_CONF,
> +	.sdmc_remap = SDMC_AST2500_REMAP_PCIE | SDMC_AST2500_REMAP_XDMA,
> +	.queue_entry_size = XDMA_AST2500_QUEUE_ENTRY_SIZE,
> +	.regs = {
> +		.bmc_cmdq_addr = XDMA_AST2500_BMC_CMDQ_ADDR,
> +		.bmc_cmdq_endp = XDMA_AST2500_BMC_CMDQ_ENDP,
> +		.bmc_cmdq_writep = XDMA_AST2500_BMC_CMDQ_WRITEP,
> +		.bmc_cmdq_readp = XDMA_AST2500_BMC_CMDQ_READP,
> +		.control = XDMA_AST2500_CTRL,
> +		.status = XDMA_AST2500_STATUS,
> +	},
> +	.status_bits = {
> +		.us_comp = XDMA_AST2500_STATUS_US_COMP,
> +		.ds_comp = XDMA_AST2500_STATUS_DS_COMP,
> +		.ds_dirty = XDMA_AST2500_STATUS_DS_DIRTY,
> +	},
> +	.set_cmd = aspeed_xdma_ast2500_set_cmd,
> +};
> +
> +static const struct aspeed_xdma_chip aspeed_ast2600_xdma_chip = {
> +	.control = XDMA_AST2600_CTRL_US_COMP | XDMA_AST2600_CTRL_DS_COMP |
> +		XDMA_AST2600_CTRL_DS_DIRTY | XDMA_AST2600_CTRL_DS_SIZE_256,
> +	.scu_bmc_class = SCU_AST2600_BMC_CLASS_REV,
> +	.scu_pcie_conf = SCU_AST2600_PCIE_CONF,
> +	.sdmc_remap = SDMC_AST2600_REMAP_XDMA,
> +	.queue_entry_size = XDMA_AST2600_QUEUE_ENTRY_SIZE,
> +	.regs = {
> +		.bmc_cmdq_addr = XDMA_AST2600_BMC_CMDQ_ADDR,
> +		.bmc_cmdq_endp = XDMA_AST2600_BMC_CMDQ_ENDP,
> +		.bmc_cmdq_writep = XDMA_AST2600_BMC_CMDQ_WRITEP,
> +		.bmc_cmdq_readp = XDMA_AST2600_BMC_CMDQ_READP,
> +		.control = XDMA_AST2600_CTRL,
> +		.status = XDMA_AST2600_STATUS,
> +	},
> +	.status_bits = {
> +		.us_comp = XDMA_AST2600_STATUS_US_COMP,
> +		.ds_comp = XDMA_AST2600_STATUS_DS_COMP,
> +		.ds_dirty = XDMA_AST2600_STATUS_DS_DIRTY,
> +	},
> +	.set_cmd = aspeed_xdma_ast2600_set_cmd,
> +};
> +
> +static const struct of_device_id aspeed_xdma_match[] = {
> +	{
> +		.compatible = "aspeed,ast2500-xdma",
> +		.data = &aspeed_ast2500_xdma_chip,
> +	},
> +	{
> +		.compatible = "aspeed,ast2600-xdma",
> +		.data = &aspeed_ast2600_xdma_chip,
> +	},
> +	{ },
> +};

Thanks, I think this representation is a lot cleaner.

> +
> +static struct platform_driver aspeed_xdma_driver = {
> +	.probe = aspeed_xdma_probe,
> +	.remove = aspeed_xdma_remove,
> +	.driver = {
> +		.name = DEVICE_NAME,
> +		.of_match_table = aspeed_xdma_match,
> +	},
> +};
> +
> +module_platform_driver(aspeed_xdma_driver);
> +
> +MODULE_AUTHOR("Eddie James");
> +MODULE_DESCRIPTION("Aspeed XDMA Engine Driver");
> +MODULE_LICENSE("GPL v2");
> diff --git a/include/uapi/linux/aspeed-xdma.h b/include/uapi/linux/aspeed-xdma.h
> new file mode 100644
> index 0000000..f39f38e
> --- /dev/null
> +++ b/include/uapi/linux/aspeed-xdma.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */
> +/* Copyright IBM Corp 2019 */
> +
> +#ifndef _UAPI_LINUX_ASPEED_XDMA_H_
> +#define _UAPI_LINUX_ASPEED_XDMA_H_
> +
> +#include <linux/types.h>
> +
> +/*
> + * aspeed_xdma_direction
> + *
> + * ASPEED_XDMA_DIRECTION_DOWNSTREAM: transfers data from the host to the BMC
> + *
> + * ASPEED_XDMA_DIRECTION_UPSTREAM: transfers data from the BMC to the host
> + *
> + * ASPEED_XDMA_DIRECTION_RESET: resets the XDMA engine
> + */
> +enum aspeed_xdma_direction {
> +	ASPEED_XDMA_DIRECTION_DOWNSTREAM = 0,
> +	ASPEED_XDMA_DIRECTION_UPSTREAM,
> +	ASPEED_XDMA_DIRECTION_RESET,

I still think having a reset action as part of the direction is a bit funky. Can you maybe
put that in a separate patch so we can debate it later?

> +};
> +
> +/*
> + * aspeed_xdma_op
> + *
> + * host_addr: the DMA address on the host side, typically configured by PCI
> + *            subsystem
> + *
> + * len: the size of the transfer in bytes
> + *
> + * direction: an enumerator indicating the direction of the DMA operation; see
> + *            enum aspeed_xdma_direction
> + */
> +struct aspeed_xdma_op {
> +	__u64 host_addr;
> +	__u32 len;
> +	__u32 direction;
> +};
> +
> +#endif /* _UAPI_LINUX_ASPEED_XDMA_H_ */
> -- 
> 1.8.3.1
> 
>