As AE4DMA offers multi-channel functionality, unlike PTDMA's single
queue, use the multi-queue support, which allows higher transfer speeds
than PTDMA, to achieve higher performance through the AE4DMA
workqueue-based mechanism.

Fixes: 69a47b16a51b ("dmaengine: ptdma: Extend ptdma to support multi-channel and version")
Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@xxxxxxx>
---
 drivers/dma/amd/ae4dma/ae4dma.h         |  2 +
 drivers/dma/amd/ptdma/ptdma-dmaengine.c | 90 ++++++++++++++++++++++++-
 2 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/amd/ae4dma/ae4dma.h b/drivers/dma/amd/ae4dma/ae4dma.h
index 265c5d436008..57f6048726bb 100644
--- a/drivers/dma/amd/ae4dma/ae4dma.h
+++ b/drivers/dma/amd/ae4dma/ae4dma.h
@@ -37,6 +37,8 @@
 #define AE4_DMA_VERSION			4
 #define CMD_AE4_DESC_DW0_VAL		2
 
+#define AE4_TIME_OUT			5000
+
 struct ae4_msix {
 	int msix_count;
 	struct msix_entry msix_entry[MAX_AE4_HW_QUEUES];
diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
index 35c84ec9608b..715ac3ae067b 100644
--- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c
+++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
@@ -198,8 +198,10 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 {
 	struct dma_async_tx_descriptor *tx_desc;
 	struct virt_dma_desc *vd;
+	struct pt_device *pt;
 	unsigned long flags;
 
+	pt = chan->pt;
 	/* Loop over descriptors until one is found with commands */
 	do {
 		if (desc) {
@@ -217,7 +219,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 
 		spin_lock_irqsave(&chan->vc.lock, flags);
 
-		if (desc) {
+		if (pt->ver != AE4_DMA_VERSION && desc) {
 			if (desc->status != DMA_COMPLETE) {
 				if (desc->status != DMA_ERROR)
 					desc->status = DMA_COMPLETE;
@@ -235,7 +237,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 
 		spin_unlock_irqrestore(&chan->vc.lock, flags);
 
-		if (tx_desc) {
+		if (pt->ver != AE4_DMA_VERSION && tx_desc) {
 			dmaengine_desc_get_callback_invoke(tx_desc, NULL);
 			dma_run_dependencies(tx_desc);
 			vchan_vdesc_fini(vd);
@@ -245,11 +247,25 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 	return NULL;
 }
 
+static inline bool ae4_core_queue_full(struct pt_cmd_queue *cmd_q)
+{
+	u32 front_wi = readl(cmd_q->reg_control + AE4_WR_IDX_OFF);
+	u32 rear_ri = readl(cmd_q->reg_control + AE4_RD_IDX_OFF);
+
+	if (((MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN) >= (MAX_CMD_QLEN - 1))
+		return true;
+
+	return false;
+}
+
 static void pt_cmd_callback(void *data, int err)
 {
 	struct pt_dma_desc *desc = data;
+	struct ae4_cmd_queue *ae4cmd_q;
 	struct dma_chan *dma_chan;
 	struct pt_dma_chan *chan;
+	struct ae4_device *ae4;
+	struct pt_device *pt;
 	int ret;
 
 	if (err == -EINPROGRESS)
@@ -257,11 +273,32 @@ static void pt_cmd_callback(void *data, int err)
 
 	dma_chan = desc->vd.tx.chan;
 	chan = to_pt_chan(dma_chan);
+	pt = chan->pt;
 
 	if (err)
 		desc->status = DMA_ERROR;
 
 	while (true) {
+		if (pt->ver == AE4_DMA_VERSION) {
+			ae4 = container_of(pt, struct ae4_device, pt);
+			ae4cmd_q = &ae4->ae4cmd_q[chan->id];
+
+			if (ae4cmd_q->q_cmd_count >= (CMD_Q_LEN - 1) ||
+			    ae4_core_queue_full(&ae4cmd_q->cmd_q)) {
+				wake_up(&ae4cmd_q->q_w);
+
+				if (wait_for_completion_timeout(&ae4cmd_q->cmp,
+								msecs_to_jiffies(AE4_TIME_OUT))
+								== 0) {
+					dev_err(pt->dev, "TIMEOUT %d:\n", ae4cmd_q->id);
+					break;
+				}
+
+				reinit_completion(&ae4cmd_q->cmp);
+				continue;
+			}
+		}
+
 		/* Check for DMA descriptor completion */
 		desc = pt_handle_active_desc(chan, desc);
 
@@ -296,6 +333,49 @@ static struct pt_dma_desc *pt_alloc_dma_desc(struct pt_dma_chan *chan,
 	return desc;
 }
 
+static void pt_cmd_callback_work(void *data, int err)
+{
+	struct dma_async_tx_descriptor *tx_desc;
+	struct pt_dma_desc *desc = data;
+	struct dma_chan *dma_chan;
+	struct virt_dma_desc *vd;
+	struct pt_dma_chan *chan;
+	unsigned long flags;
+
+	dma_chan = desc->vd.tx.chan;
+	chan = to_pt_chan(dma_chan);
+
+	if (err == -EINPROGRESS)
+		return;
+
+	tx_desc = &desc->vd.tx;
+	vd = &desc->vd;
+
+	if (err)
+		desc->status = DMA_ERROR;
+
+	spin_lock_irqsave(&chan->vc.lock, flags);
+	if (desc) {
+		if (desc->status != DMA_COMPLETE) {
+			if (desc->status != DMA_ERROR)
+				desc->status = DMA_COMPLETE;
+
+			dma_cookie_complete(tx_desc);
+			dma_descriptor_unmap(tx_desc);
+		} else {
+			tx_desc = NULL;
+		}
+	}
+	spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+	if (tx_desc) {
+		dmaengine_desc_get_callback_invoke(tx_desc, NULL);
+		dma_run_dependencies(tx_desc);
+		list_del(&desc->vd.node);
+		vchan_vdesc_fini(vd);
+	}
+}
+
 static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
 					  dma_addr_t dst,
 					  dma_addr_t src,
@@ -327,6 +407,7 @@ static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
 	desc->len = len;
 
 	if (pt->ver == AE4_DMA_VERSION) {
+		pt_cmd->pt_cmd_callback = pt_cmd_callback_work;
 		ae4 = container_of(pt, struct ae4_device, pt);
 		ae4cmd_q = &ae4->ae4cmd_q[chan->id];
 		mutex_lock(&ae4cmd_q->cmd_lock);
@@ -367,13 +448,16 @@ static void pt_issue_pending(struct dma_chan *dma_chan)
 {
 	struct pt_dma_chan *chan = to_pt_chan(dma_chan);
 	struct pt_dma_desc *desc;
+	struct pt_device *pt;
 	unsigned long flags;
 	bool engine_is_idle = true;
 
+	pt = chan->pt;
+
 	spin_lock_irqsave(&chan->vc.lock, flags);
 
 	desc = pt_next_dma_desc(chan);
-	if (desc)
+	if (desc && pt->ver != AE4_DMA_VERSION)
 		engine_is_idle = false;
 
 	vchan_issue_pending(&chan->vc);
-- 
2.25.1
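
A note on the new ae4_core_queue_full() helper (illustrative only, not part of
the patch): it treats the hardware write index and read index as positions in a
circular command queue and reports the queue as full once only one slot remains
free, so the write index can never catch up with the read index. Below is a
minimal user-space sketch of the same occupancy arithmetic, assuming a
hypothetical queue depth of 32 and hypothetical helper names
(queue_occupancy/queue_full); the driver itself reads the real indices from the
AE4_WR_IDX_OFF/AE4_RD_IDX_OFF registers.

#include <stdio.h>

#define MAX_CMD_QLEN	32	/* hypothetical queue depth, for illustration only */

/*
 * Same modular arithmetic as ae4_core_queue_full(): number of slots
 * currently in use between the read index and the write index of a
 * circular command queue.
 */
static unsigned int queue_occupancy(unsigned int wr_idx, unsigned int rd_idx)
{
	return (MAX_CMD_QLEN + wr_idx - rd_idx) % MAX_CMD_QLEN;
}

static int queue_full(unsigned int wr_idx, unsigned int rd_idx)
{
	/* "Full" is reported one slot early so wr_idx never overtakes rd_idx. */
	return queue_occupancy(wr_idx, rd_idx) >= (MAX_CMD_QLEN - 1);
}

int main(void)
{
	/* Write index has wrapped past the read index: 30 of 32 slots used. */
	printf("occupancy=%u full=%d\n", queue_occupancy(3, 5), queue_full(3, 5));

	/* Indices equal: the queue is empty. */
	printf("occupancy=%u full=%d\n", queue_occupancy(7, 7), queue_full(7, 7));

	return 0;
}

Compiled stand-alone with any C compiler, the two sample index pairs print an
occupancy of 30 (not yet full) and 0 (empty), which mirrors how pt_cmd_callback()
decides whether to wait on the queue completion before submitting more work.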