On Mon, 2024-06-17 at 15:33 +0530, Basavaraj Natikar wrote:
> Use the pt_dmaengine_register function to register an AE4DMA DMA
> engine.
>
> Reviewed-by: Raju Rangoju <Raju.Rangoju@xxxxxxx>
> Signed-off-by: Basavaraj Natikar <Basavaraj.Natikar@xxxxxxx>
> ---
>  drivers/dma/amd/ae4dma/Makefile     |  2 +-
>  drivers/dma/amd/ae4dma/ae4dma-dev.c | 73 +++++++++++++++++++++++++++++
>  drivers/dma/amd/ae4dma/ae4dma-pci.c |  1 +
>  drivers/dma/amd/ae4dma/ae4dma.h     |  2 +
>  4 files changed, 77 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/dma/amd/ae4dma/Makefile b/drivers/dma/amd/ae4dma/Makefile
> index e918f85a80ec..165d1c74b732 100644
> --- a/drivers/dma/amd/ae4dma/Makefile
> +++ b/drivers/dma/amd/ae4dma/Makefile
> @@ -5,6 +5,6 @@
>
>  obj-$(CONFIG_AMD_AE4DMA) += ae4dma.o
>
> -ae4dma-objs := ae4dma-dev.o
> +ae4dma-objs := ae4dma-dev.o ../ptdma/ptdma-dmaengine.o ../common/amd_dma.o
>
>  ae4dma-$(CONFIG_PCI) += ae4dma-pci.o
> diff --git a/drivers/dma/amd/ae4dma/ae4dma-dev.c b/drivers/dma/amd/ae4dma/ae4dma-dev.c
> index 958bdab8db59..77c37649d8d1 100644
> --- a/drivers/dma/amd/ae4dma/ae4dma-dev.c
> +++ b/drivers/dma/amd/ae4dma/ae4dma-dev.c
> @@ -60,6 +60,15 @@ static void ae4_check_status_error(struct ae4_cmd_queue *ae4cmd_q, int idx)
>  	}
>  }
>
> +void pt_check_status_trans(struct pt_device *pt, struct pt_cmd_queue *cmd_q)
> +{
> +	struct ae4_cmd_queue *ae4cmd_q = container_of(cmd_q, struct ae4_cmd_queue, cmd_q);
> +	int i;
> +
> +	for (i = 0; i < CMD_Q_LEN; i++)
> +		ae4_check_status_error(ae4cmd_q, i);
> +}
> +
>  static void ae4_pending_work(struct work_struct *work)
>  {
>  	struct ae4_cmd_queue *ae4cmd_q = container_of(work, struct ae4_cmd_queue, p_work.work);
> @@ -123,6 +132,66 @@ static irqreturn_t ae4_core_irq_handler(int irq, void *data)
>  	return IRQ_HANDLED;
>  }
>
> +static int ae4_core_execute_cmd(struct ae4dma_desc *desc, struct ae4_cmd_queue *ae4cmd_q)
> +{

Hi,

The memory ordering in this function seems to be handled through
several different mechanisms at once.

> +	bool soc = FIELD_GET(DWORD0_SOC, desc->dwouv.dw0);
> +	struct pt_cmd_queue *cmd_q = &ae4cmd_q->cmd_q;
> +	u32 tail_wi;
> +
> +	if (soc) {
> +		desc->dwouv.dw0 |= FIELD_PREP(DWORD0_IOC, desc->dwouv.dw0);
> +		desc->dwouv.dw0 &= ~DWORD0_SOC;
> +	}
> +
> +	mutex_lock(&ae4cmd_q->cmd_lock);
> +
> +	tail_wi = atomic_read(&ae4cmd_q->tail_wi);
> +	memcpy(&cmd_q->qbase[tail_wi], desc, sizeof(struct ae4dma_desc));
> +
> +	atomic64_inc(&ae4cmd_q->q_cmd_count);
> +
> +	tail_wi = (tail_wi + 1) % CMD_Q_LEN;
> +
> +	atomic_set(&ae4cmd_q->tail_wi, tail_wi);
> +	/* Synchronize ordering */
> +	mb();

This mb() should be redundant: writel() already comes with its own
barrier.

> +
> +	writel(tail_wi, cmd_q->reg_control + 0x10);
> +	/* Synchronize ordering */
> +	mb();
> +
> +	mutex_unlock(&ae4cmd_q->cmd_lock);

Same here: the compiler cannot reorder the writel() with the
subsequent mutex_unlock(). If it could, the entire kernel would
explode.

So there are three mechanisms in play here:

1. mutex
2. atomics
3. memory barriers

Can't the ordering be ensured by the mutex alone?

Regards,
P.
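P.S. To make the suggestion concrete, here is an untested sketch of
how I would expect the submission path to look with the explicit
barriers dropped. It assumes tail_wi is only ever accessed under
cmd_lock and can therefore be demoted from atomic_t to a plain u32 in
struct ae4_cmd_queue; if some other context reads it locklessly, that
access would need separate treatment.

	/*
	 * Untested sketch: ordering via cmd_lock plus the barrier
	 * implied by writel() alone.  Assumes tail_wi is a plain u32
	 * protected by cmd_lock.
	 */
	static int ae4_core_execute_cmd(struct ae4dma_desc *desc,
					struct ae4_cmd_queue *ae4cmd_q)
	{
		struct pt_cmd_queue *cmd_q = &ae4cmd_q->cmd_q;
		u32 tail_wi;

		if (FIELD_GET(DWORD0_SOC, desc->dwouv.dw0)) {
			desc->dwouv.dw0 |= FIELD_PREP(DWORD0_IOC, desc->dwouv.dw0);
			desc->dwouv.dw0 &= ~DWORD0_SOC;
		}

		mutex_lock(&ae4cmd_q->cmd_lock);

		/* Write the descriptor into the current ring slot... */
		tail_wi = ae4cmd_q->tail_wi;
		memcpy(&cmd_q->qbase[tail_wi], desc, sizeof(*desc));

		atomic64_inc(&ae4cmd_q->q_cmd_count);

		ae4cmd_q->tail_wi = (tail_wi + 1) % CMD_Q_LEN;

		/*
		 * ...then ring the doorbell.  writel() orders the
		 * descriptor write above against the MMIO write, and
		 * mutex_unlock() has release semantics, so no explicit
		 * mb() is needed on either side.
		 */
		writel(ae4cmd_q->tail_wi, cmd_q->reg_control + 0x10);

		mutex_unlock(&ae4cmd_q->cmd_lock);

		wake_up(&ae4cmd_q->q_w);

		return 0;
	}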
> +
> +	wake_up(&ae4cmd_q->q_w);
> +
> +	return 0;
> +}
> +
> +int pt_core_perform_passthru(struct pt_cmd_queue *cmd_q,
> +			     struct pt_passthru_engine *pt_engine)
> +{
> +	struct ae4_cmd_queue *ae4cmd_q = container_of(cmd_q, struct ae4_cmd_queue, cmd_q);
> +	struct ae4dma_desc desc;
> +
> +	cmd_q->cmd_error = 0;
> +	cmd_q->total_pt_ops++;
> +	memset(&desc, 0, sizeof(desc));
> +	desc.dwouv.dws.byte0 = CMD_AE4_DESC_DW0_VAL;
> +
> +	desc.dw1.status = 0;
> +	desc.dw1.err_code = 0;
> +	desc.dw1.desc_id = 0;
> +
> +	desc.length = pt_engine->src_len;
> +
> +	desc.src_lo = upper_32_bits(pt_engine->src_dma);
> +	desc.src_hi = lower_32_bits(pt_engine->src_dma);
> +	desc.dst_lo = upper_32_bits(pt_engine->dst_dma);
> +	desc.dst_hi = lower_32_bits(pt_engine->dst_dma);
> +
> +	return ae4_core_execute_cmd(&desc, ae4cmd_q);
> +}
> +
>  void ae4_destroy_work(struct ae4_device *ae4)
>  {
>  	struct ae4_cmd_queue *ae4cmd_q;
> @@ -202,5 +271,9 @@ int ae4_core_init(struct ae4_device *ae4)
>  		init_completion(&ae4cmd_q->cmp);
>  	}
>
> +	ret = pt_dmaengine_register(pt);
> +	if (ret)
> +		ae4_destroy_work(ae4);
> +
>  	return ret;
>  }
> diff --git a/drivers/dma/amd/ae4dma/ae4dma-pci.c b/drivers/dma/amd/ae4dma/ae4dma-pci.c
> index ddebf0609c4d..5450fa551eea 100644
> --- a/drivers/dma/amd/ae4dma/ae4dma-pci.c
> +++ b/drivers/dma/amd/ae4dma/ae4dma-pci.c
> @@ -131,6 +131,7 @@ static int ae4_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>
>  	pt = &ae4->pt;
>  	pt->dev = dev;
> +	pt->ver = AE4_DMA_VERSION;
>
>  	pt->io_regs = pcim_iomap_table(pdev)[0];
>  	if (!pt->io_regs) {
> diff --git a/drivers/dma/amd/ae4dma/ae4dma.h b/drivers/dma/amd/ae4dma/ae4dma.h
> index 4e4584e152a1..f1b6dcc1d8c3 100644
> --- a/drivers/dma/amd/ae4dma/ae4dma.h
> +++ b/drivers/dma/amd/ae4dma/ae4dma.h
> @@ -16,6 +16,7 @@
>
>  #define AE4_DESC_COMPLETED	0x3
>  #define AE4_DMA_VERSION		4
> +#define CMD_AE4_DESC_DW0_VAL	2
>
>  struct ae4_msix {
>  	int msix_count;
> @@ -36,6 +37,7 @@ struct ae4_cmd_queue {
>  	atomic64_t done_cnt;
>  	atomic64_t q_cmd_count;
>  	atomic_t dridx;
> +	atomic_t tail_wi;
>  	unsigned int id;
>  };
>