[(subset) PATCH v2 3/3] riscv: dma-mapping: replace custom code with generic implementation

Prabhakar <prabhakar.csengg@xxxxxxxxx> · Mon, 14 Aug 2023 21:28:21 +0100

From: Arnd Bergmann <arnd@xxxxxxxx>

Now that all of these have consistent behavior, replace them with
a single shared implementation of arch_sync_dma_for_device() and
arch_sync_dma_for_cpu() and three parameters to pick how they should
operate:

 - If the CPU has speculative prefetching, then the cache
   has to be invalidated after a transfer from the device.
   On the rarer CPUs without prefetching, this can be skipped,
   with all cache management happening before the transfer.
   This flag can be runtime detected, but is usually fixed
   per architecture.

 - Some architectures currently clean the caches before DMA
   from a device, while others invalidate it. There has not
   been a conclusion regarding whether we should change all
   architectures to use clean instead, so this adds an
   architecture specific flag that we can change later on.

For the function naming, I picked 'wback' over 'clean', and 'wback_inv'
over 'flush', to avoid any ambiguity of what the helper functions are
supposed to do.

Moving the global functions into a header file is usually a bad idea
as it prevents the header from being included more than once, but it
helps keep the behavior as close as possible to the previous state,
including the possibility of inlining most of it into these functions
where that was done before. This also helps keep the global namespace
clean, by hiding the new arch_dma_cache{_wback,_inv,_wback_inv} from
device drivers that might use them incorrectly.

Signed-off-by: Arnd Bergmann <arnd@xxxxxxxx>
Reviewed-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@xxxxxxxxxxxxxx>
Tested-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@xxxxxxxxxxxxxx>
[PL: Dropped other archs, updated commit message and fixed checkpatch issues]
Signed-off-by: Lad Prabhakar <prabhakar.mahadev-lad.rj@xxxxxxxxxxxxxx>
---
v1->v2
* Updated commit message
* Fixed checkpatch issues
* Dropped other archs
---
 arch/riscv/mm/dma-noncoherent.c |  50 +++++++-------
 include/linux/dma-sync.h        | 113 ++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 27 deletions(-)
 create mode 100644 include/linux/dma-sync.h

diff --git a/arch/riscv/mm/dma-noncoherent.c b/arch/riscv/mm/dma-noncoherent.c
index fc6377a64c8d..b6a1e9cc9339 100644
--- a/arch/riscv/mm/dma-noncoherent.c
+++ b/arch/riscv/mm/dma-noncoherent.c
@@ -12,43 +12,39 @@
 
 static bool noncoherent_supported __ro_after_init;
 
-void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
-			      enum dma_data_direction dir)
+static inline void arch_dma_cache_wback(phys_addr_t paddr, size_t size)
 {
 	void *vaddr = phys_to_virt(paddr);
 
-	switch (dir) {
-	case DMA_TO_DEVICE:
-		ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
-		break;
-	case DMA_FROM_DEVICE:
-		ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
-		break;
-	case DMA_BIDIRECTIONAL:
-		ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
-		break;
-	default:
-		break;
-	}
+	ALT_CMO_OP(clean, vaddr, size, riscv_cbom_block_size);
 }
 
-void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
-			   enum dma_data_direction dir)
+static inline void arch_dma_cache_inv(phys_addr_t paddr, size_t size)
 {
 	void *vaddr = phys_to_virt(paddr);
 
-	switch (dir) {
-	case DMA_TO_DEVICE:
-		break;
-	case DMA_FROM_DEVICE:
-	case DMA_BIDIRECTIONAL:
-		ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
-		break;
-	default:
-		break;
-	}
+	ALT_CMO_OP(inval, vaddr, size, riscv_cbom_block_size);
 }
 
+static inline void arch_dma_cache_wback_inv(phys_addr_t paddr, size_t size)
+{
+	void *vaddr = phys_to_virt(paddr);
+
+	ALT_CMO_OP(flush, vaddr, size, riscv_cbom_block_size);
+}
+
+static inline bool arch_sync_dma_clean_before_fromdevice(void)
+{
+	return true;
+}
+
+static inline bool arch_sync_dma_cpu_needs_post_dma_flush(void)
+{
+	return true;
+}
+
+#include <linux/dma-sync.h>
+
 void arch_dma_prep_coherent(struct page *page, size_t size)
 {
 	void *flush_addr = page_address(page);
diff --git a/include/linux/dma-sync.h b/include/linux/dma-sync.h
new file mode 100644
index 000000000000..be23e8dda2e2
--- /dev/null
+++ b/include/linux/dma-sync.h
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * Cache operations depending on function and direction argument, inspired by
+ * https://lore.kernel.org/lkml/20180518175004.GF17671@xxxxxxxxxxxxxxxxxxxxx
+ * "dma_sync_*_for_cpu and direction=TO_DEVICE (was Re: [PATCH 02/20]
+ * dma-mapping: provide a generic dma-noncoherent implementation)"
+ *
+ *          |   map          ==  for_device     |   unmap     ==  for_cpu
+ *          |----------------------------------------------------------------
+ * TO_DEV   |   writeback        writeback      |   none          none
+ * FROM_DEV |   invalidate       invalidate     |   invalidate*   invalidate*
+ * BIDIR    |   writeback        writeback      |   invalidate    invalidate
+ *
+ *     [*] needed for CPU speculative prefetches
+ *
+ * NOTE: we don't check the validity of direction argument as it is done in
+ * upper layer functions (in include/linux/dma-mapping.h)
+ *
+ * This file can be included by arch/.../kernel/dma-noncoherent.c to provide
+ * the respective high-level operations without having to expose the
+ * cache management ops to drivers.
+ */
+
+#ifndef __DMA_SYNC_H__
+#define __DMA_SYNC_H__
+
+void arch_sync_dma_for_device(phys_addr_t paddr, size_t size,
+			      enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_TO_DEVICE:
+		/*
+		 * This may be an empty function on write-through caches,
+		 * and it might invalidate the cache if an architecture has
+		 * a write-back cache but no way to write it back without
+		 * invalidating
+		 */
+		arch_dma_cache_wback(paddr, size);
+		break;
+
+	case DMA_FROM_DEVICE:
+		/*
+		 * FIXME: this should be handled the same across all
+		 * architectures, see
+		 * https://lore.kernel.org/all/20220606152150.GA31568@willie-the-truck/
+		 */
+		if (!arch_sync_dma_clean_before_fromdevice()) {
+			arch_dma_cache_inv(paddr, size);
+			break;
+		}
+		fallthrough;
+
+	case DMA_BIDIRECTIONAL:
+		/* Skip the invalidate here if it's done later */
+		if (IS_ENABLED(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) &&
+		    arch_sync_dma_cpu_needs_post_dma_flush())
+			arch_dma_cache_wback(paddr, size);
+		else
+			arch_dma_cache_wback_inv(paddr, size);
+		break;
+
+	default:
+		break;
+	}
+}
+
+#ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU
+/*
+ * Mark the D-cache clean for these pages to avoid extra flushing.
+ */
+static void arch_dma_mark_dcache_clean(phys_addr_t paddr, size_t size)
+{
+#ifdef CONFIG_ARCH_DMA_MARK_DCACHE_CLEAN
+	unsigned long pfn = PFN_UP(paddr);
+	unsigned long off = paddr & (PAGE_SIZE - 1);
+	size_t left = size;
+
+	if (off)
+		left -= PAGE_SIZE - off;
+
+	while (left >= PAGE_SIZE) {
+		struct page *page = pfn_to_page(pfn++);
+
+		set_bit(PG_dcache_clean, &page->flags);
+		left -= PAGE_SIZE;
+	}
+#endif
+}
+
+void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
+			   enum dma_data_direction dir)
+{
+	switch (dir) {
+	case DMA_TO_DEVICE:
+		break;
+
+	case DMA_FROM_DEVICE:
+	case DMA_BIDIRECTIONAL:
+		/* FROM_DEVICE invalidate needed if speculative CPU prefetch only */
+		if (arch_sync_dma_cpu_needs_post_dma_flush())
+			arch_dma_cache_inv(paddr, size);
+
+		if (size > PAGE_SIZE)
+			arch_dma_mark_dcache_clean(paddr, size);
+		break;
+
+	default:
+		break;
+	}
+}
+#endif
+#endif /* __DMA_SYNC_H__ */
-- 
2.34.1