Re: [PATCH 3/3] dmaengine: ptdma: Utilize the AE4DMA engine's multi-queue functionality

On 2/10/2025 7:56 PM, Basavaraj Natikar wrote:


On 2/10/2025 7:48 PM, Vinod Koul wrote:
On 03-02-25, 21:55, Basavaraj Natikar wrote:
Unlike PTDMA, which has a single queue, AE4DMA provides multiple hardware
queues. Use this multi-queue capability, together with the AE4DMA
workqueue-based submission mechanism, to achieve higher throughput than
PTDMA.

Fixes: 69a47b16a51b ("dmaengine: ptdma: Extend ptdma to support multi-channel and version")
Why is this a fix, again!

Yes. AE4DMA is much faster with multiple queues, but multi-queue processing can occasionally take longer. The patch therefore adds a timeout to the producer/consumer synchronization, so that a long wait cannot turn into a hang.

Thanks,
--
Basavaraj
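
For context, the synchronization described above boils down to a bounded
wait before submitting more work. A minimal sketch of that pattern
(hypothetical names, not the patch code) could look like this:

#include <linux/completion.h>
#include <linux/device.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

#define DEMO_TIMEOUT_MS	5000	/* same order as AE4_TIME_OUT below */

/*
 * Producer side: wait for the consumer to signal that queue space is
 * available again, but never wait forever -- bail out with an error
 * instead of hanging.
 */
static int demo_wait_for_room(struct completion *queue_space,
			      struct device *dev)
{
	unsigned long remaining;

	remaining = wait_for_completion_timeout(queue_space,
						msecs_to_jiffies(DEMO_TIMEOUT_MS));
	if (!remaining) {
		dev_err(dev, "timed out waiting for queue space\n");
		return -ETIMEDOUT;
	}

	reinit_completion(queue_space);
	return 0;
}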


Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@xxxxxxx>
---
  drivers/dma/amd/ae4dma/ae4dma.h         |  2 +
  drivers/dma/amd/ptdma/ptdma-dmaengine.c | 90 ++++++++++++++++++++++++-
  2 files changed, 89 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/amd/ae4dma/ae4dma.h b/drivers/dma/amd/ae4dma/ae4dma.h
index 265c5d436008..57f6048726bb 100644
--- a/drivers/dma/amd/ae4dma/ae4dma.h
+++ b/drivers/dma/amd/ae4dma/ae4dma.h
@@ -37,6 +37,8 @@
  #define AE4_DMA_VERSION            4
  #define CMD_AE4_DESC_DW0_VAL        2
 
+#define AE4_TIME_OUT            5000
+
  struct ae4_msix {
      int msix_count;
      struct msix_entry msix_entry[MAX_AE4_HW_QUEUES];
diff --git a/drivers/dma/amd/ptdma/ptdma-dmaengine.c b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
index 35c84ec9608b..715ac3ae067b 100644
--- a/drivers/dma/amd/ptdma/ptdma-dmaengine.c
+++ b/drivers/dma/amd/ptdma/ptdma-dmaengine.c
@@ -198,8 +198,10 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
  {
      struct dma_async_tx_descriptor *tx_desc;
      struct virt_dma_desc *vd;
+    struct pt_device *pt;
      unsigned long flags;
 
+    pt = chan->pt;
      /* Loop over descriptors until one is found with commands */
      do {
          if (desc) {
@@ -217,7 +219,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 
          spin_lock_irqsave(&chan->vc.lock, flags);
 
-        if (desc) {
+        if (pt->ver != AE4_DMA_VERSION && desc) {
              if (desc->status != DMA_COMPLETE) {
                  if (desc->status != DMA_ERROR)
                      desc->status = DMA_COMPLETE;
@@ -235,7 +237,7 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
 
          spin_unlock_irqrestore(&chan->vc.lock, flags);
 
-        if (tx_desc) {
+        if (pt->ver != AE4_DMA_VERSION && tx_desc) {
Why should this handling be different for DMA_VERSION?

PTDMA has a single queue and always raises a per-command interrupt, whereas
AE4DMA lets us submit multiple commands at a time across multiple queues,
which makes it much faster. However, multi-queue processing can occasionally
take longer, so a timeout is added to the producer/consumer synchronization
to avoid long waits that could eventually cause a hang.


              dmaengine_desc_get_callback_invoke(tx_desc, NULL);
              dma_run_dependencies(tx_desc);
              vchan_vdesc_fini(vd);
@@ -245,11 +247,25 @@ static struct pt_dma_desc *pt_handle_active_desc(struct pt_dma_chan *chan,
      return NULL;
  }
 
+static inline bool ae4_core_queue_full(struct pt_cmd_queue *cmd_q)
+{
+    u32 front_wi = readl(cmd_q->reg_control + AE4_WR_IDX_OFF);
+    u32 rear_ri = readl(cmd_q->reg_control + AE4_RD_IDX_OFF);
+
+    if (((MAX_CMD_QLEN + front_wi - rear_ri) % MAX_CMD_QLEN) >= (MAX_CMD_QLEN - 1))
+        return true;
+
+    return false;
+}
+
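
For reference, this is the usual one-slot-open ring occupancy test: with
write index front_wi and read index rear_ri, (MAX_CMD_QLEN + front_wi -
rear_ri) % MAX_CMD_QLEN is the number of occupied slots, and the queue is
treated as full when only one slot is still free. A standalone sketch with a
hypothetical queue length of 32:

/* Hypothetical helper, not part of the patch: occupied slots in a ring
 * of 'qlen' entries given hardware write index 'wi' and read index 'ri'.
 */
static inline unsigned int ring_occupancy(unsigned int wi, unsigned int ri,
					  unsigned int qlen)
{
	return (qlen + wi - ri) % qlen;
}

/*
 * With qlen = 32: wi = 7, ri = 7 -> 0 slots used (empty);
 * wi = 6, ri = 7 -> 31 slots used, i.e. >= qlen - 1, so the queue is
 * reported full while one slot stays empty, keeping "full" and "empty"
 * distinguishable.
 */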
  static void pt_cmd_callback(void *data, int err)
  {
      struct pt_dma_desc *desc = data;
+    struct ae4_cmd_queue *ae4cmd_q;
      struct dma_chan *dma_chan;
      struct pt_dma_chan *chan;
+    struct ae4_device *ae4;
+    struct pt_device *pt;
      int ret;
 
      if (err == -EINPROGRESS)
@@ -257,11 +273,32 @@ static void pt_cmd_callback(void *data, int err)
 
      dma_chan = desc->vd.tx.chan;
      chan = to_pt_chan(dma_chan);
+    pt = chan->pt;
 
      if (err)
          desc->status = DMA_ERROR;
 
      while (true) {
+        if (pt->ver == AE4_DMA_VERSION) {
+            ae4 = container_of(pt, struct ae4_device, pt);
+            ae4cmd_q = &ae4->ae4cmd_q[chan->id];
+
+            if (ae4cmd_q->q_cmd_count >= (CMD_Q_LEN - 1) ||
+                ae4_core_queue_full(&ae4cmd_q->cmd_q)) {
+                wake_up(&ae4cmd_q->q_w);
+
+                if (wait_for_completion_timeout(&ae4cmd_q->cmp,
+                                                msecs_to_jiffies(AE4_TIME_OUT)) == 0) {
+                    dev_err(pt->dev, "TIMEOUT %d:\n", ae4cmd_q->id);
+                    break;
+                }
+
+                reinit_completion(&ae4cmd_q->cmp);
+                continue;
+            }
+        }
+
          /* Check for DMA descriptor completion */
          desc = pt_handle_active_desc(chan, desc);
 
@@ -296,6 +333,49 @@ static struct pt_dma_desc *pt_alloc_dma_desc(struct pt_dma_chan *chan,
      return desc;
  }
 
+static void pt_cmd_callback_work(void *data, int err)
+{
+    struct dma_async_tx_descriptor *tx_desc;
+    struct pt_dma_desc *desc = data;
+    struct dma_chan *dma_chan;
+    struct virt_dma_desc *vd;
+    struct pt_dma_chan *chan;
+    unsigned long flags;
+
+    dma_chan = desc->vd.tx.chan;
+    chan = to_pt_chan(dma_chan);
+
+    if (err == -EINPROGRESS)
+        return;
+
+    tx_desc = &desc->vd.tx;
+    vd = &desc->vd;
+
+    if (err)
+        desc->status = DMA_ERROR;
+
+    spin_lock_irqsave(&chan->vc.lock, flags);
+    if (desc) {
+        if (desc->status != DMA_COMPLETE) {
+            if (desc->status != DMA_ERROR)
+                desc->status = DMA_COMPLETE;
+
+            dma_cookie_complete(tx_desc);
+            dma_descriptor_unmap(tx_desc);
+        } else {
+            tx_desc = NULL;
+        }
+    }
+    spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+    if (tx_desc) {
+        dmaengine_desc_get_callback_invoke(tx_desc, NULL);
+        dma_run_dependencies(tx_desc);
+        list_del(&desc->vd.node);
+        vchan_vdesc_fini(vd);
+    }
+}
Why do we have callback in driver...?

PTDMA already has a similar callback, pt_cmd_callback, so AE4DMA also has a
callback for the multi-queue commands: once a command has been processed, it
signals the upper layer that processing is done for that queue.

Thanks,
--
Basavaraj
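
For completeness, that signal reaches the dmaengine client through the
standard callback path. A minimal client-side sketch (hypothetical names,
standard dmaengine API) of how the upper layer gets notified:

#include <linux/completion.h>
#include <linux/dmaengine.h>
#include <linux/errno.h>

/* Hypothetical client callback: the driver's completion path (the
 * dmaengine_desc_get_callback_invoke() above) ends up calling this.
 */
static void demo_memcpy_done(void *arg)
{
	complete(arg);	/* tell the submitter the transfer finished */
}

static int demo_memcpy(struct dma_chan *chan, dma_addr_t dst,
		       dma_addr_t src, size_t len)
{
	struct dma_async_tx_descriptor *tx;
	DECLARE_COMPLETION_ONSTACK(done);

	tx = dmaengine_prep_dma_memcpy(chan, dst, src, len, DMA_PREP_INTERRUPT);
	if (!tx)
		return -EIO;

	tx->callback = demo_memcpy_done;
	tx->callback_param = &done;
	dmaengine_submit(tx);
	dma_async_issue_pending(chan);

	wait_for_completion(&done);
	return 0;
}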


+
  static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
                        dma_addr_t dst,
                        dma_addr_t src,
@@ -327,6 +407,7 @@ static struct pt_dma_desc *pt_create_desc(struct dma_chan *dma_chan,
      desc->len = len;
 
      if (pt->ver == AE4_DMA_VERSION) {
+        pt_cmd->pt_cmd_callback = pt_cmd_callback_work;
          ae4 = container_of(pt, struct ae4_device, pt);
          ae4cmd_q = &ae4->ae4cmd_q[chan->id];
          mutex_lock(&ae4cmd_q->cmd_lock);
@@ -367,13 +448,16 @@ static void pt_issue_pending(struct dma_chan *dma_chan)
  {
      struct pt_dma_chan *chan = to_pt_chan(dma_chan);
      struct pt_dma_desc *desc;
+    struct pt_device *pt;
      unsigned long flags;
      bool engine_is_idle = true;
 
+    pt = chan->pt;
+
      spin_lock_irqsave(&chan->vc.lock, flags);
 
     desc = pt_next_dma_desc(chan);
-    if (desc)
+    if (desc && pt->ver != AE4_DMA_VERSION)
          engine_is_idle = false;
 
     vchan_issue_pending(&chan->vc);
--
2.25.1





