[PATCH 02/16] genpt: Add a specialized allocator for page table levels

Jason Gunthorpe <jgg@xxxxxxxxxx> · Thu, 15 Aug 2024 12:11:18 -0300

A radix or "page table level" is the memory inside the page table used to
store the data. Generally formats have a fixed size for these tables and
all are uniform. It is usually PAGE_SIZE of their respective
architectures, but not always. Often the top most table level has a
different size than the rest.

The key function of this allocator is a way to maintain a linked list of
the memory, and an RCU free capability for those lists. Most of the
algorithms in the iommu implementation rely on the linked lists, and the
RCU is necessary for debugfs support.

Use the new folio-ish infrastructure for creating a custom struct page to
store the additional data.

Included in this is some support for managing the CPU cache invalidation
algorithm that ARM uses. The folio is used to record when the table memory
has been DMA mapped along with helpers to DMA API map/unmap the memory.

FIXME: Several of the formats require sub-page sizes (e.g. ARMv7s uses 1k
table pages on a 4k architecture, ARMv8 can use 4k/16k/64k pages
regardless of the CPU PAGE_SIZE). 4:1 can be handled by giving up on the
no-allocate RCU and storing 4 next pointers directly in the folio. The
16:1 case would require allocating additional memory to hold the metadata,
much like Matthew's proposed memdesc. In a future memdesc world the
per-folio metadata would be allocated to the required size. This logic is
not implemented yet.

FIXME:
 - sub-page sizes. Without support it wastes memory but is suitable for
   functional testing.
 - This has become weirdly named
 - This is general, except it does use NR_IOMMU_PAGES

Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx>
---
 drivers/iommu/generic_pt/Kconfig    |   8 ++
 drivers/iommu/generic_pt/Makefile   |   4 +
 drivers/iommu/generic_pt/pt_alloc.c | 174 ++++++++++++++++++++++++++++
 drivers/iommu/generic_pt/pt_alloc.h |  98 ++++++++++++++++
 4 files changed, 284 insertions(+)
 create mode 100644 drivers/iommu/generic_pt/pt_alloc.c
 create mode 100644 drivers/iommu/generic_pt/pt_alloc.h

diff --git a/drivers/iommu/generic_pt/Kconfig b/drivers/iommu/generic_pt/Kconfig
index 775a3afb563f72..c22a55b00784d0 100644
--- a/drivers/iommu/generic_pt/Kconfig
+++ b/drivers/iommu/generic_pt/Kconfig
@@ -19,4 +19,12 @@ config DEBUG_GENERIC_PT
 	  kernels.
 
 	  The kunit tests require this to be enabled to get full coverage.
+
+config IOMMU_PT
+	tristate "IOMMU Page Tables"
+	depends on IOMMU_SUPPORT
+	depends on GENERIC_PT
+	default n
+	help
+	  Generic library for building IOMMU page tables
 endif
diff --git a/drivers/iommu/generic_pt/Makefile b/drivers/iommu/generic_pt/Makefile
index f66554cd5c4518..f7862499642237 100644
--- a/drivers/iommu/generic_pt/Makefile
+++ b/drivers/iommu/generic_pt/Makefile
@@ -1 +1,5 @@
 # SPDX-License-Identifier: GPL-2.0
+iommu_pt-y := \
+	pt_alloc.o
+
+obj-$(CONFIG_IOMMU_PT) += iommu_pt.o
diff --git a/drivers/iommu/generic_pt/pt_alloc.c b/drivers/iommu/generic_pt/pt_alloc.c
new file mode 100644
index 00000000000000..4ee032161103f3
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_alloc.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#include "pt_alloc.h"
+#include "pt_log2.h"
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+
+#define RADIX_MATCH(pg, rl)                        \
+	static_assert(offsetof(struct page, pg) == \
+		      offsetof(struct pt_radix_meta, rl))
+RADIX_MATCH(flags, __page_flags);
+RADIX_MATCH(rcu_head, rcu_head);	/* Ensure bit 0 is clear */
+RADIX_MATCH(mapping, __page_mapping);
+RADIX_MATCH(private, free_next);
+RADIX_MATCH(page_type, __page_type);
+RADIX_MATCH(_refcount, __page_refcount);
+#ifdef CONFIG_MEMCG
+RADIX_MATCH(memcg_data, memcg_data);
+#endif
+#undef RADIX_MATCH
+static_assert(sizeof(struct pt_radix_meta) <= sizeof(struct page));
+
+static inline struct folio *meta_to_folio(struct pt_radix_meta *meta)
+{
+	return (struct folio *)meta;	/* plain cast, same pointer value */
+}
+
+/* Allocate a zeroed, accounted table level; ERR_PTR(-ENOMEM) on failure. */
+void *pt_radix_alloc(struct pt_common *owner, int nid, size_t lg2sz, gfp_t gfp)
+{
+	struct pt_radix_meta *meta;
+	unsigned int order = 0;
+	struct folio *folio;
+	long nr_pages;
+
+	/*
+	 * FIXME we need to support sub page size tables, eg to allow a 4K table
+	 * on a 64K kernel. This should be done by allocating extra memory
+	 * per page and placing the pointer in the meta. The extra memory can
+	 * contain the additional list heads and rcu's required.
+	 */
+	if (lg2sz > PAGE_SHIFT)
+		order = lg2sz - PAGE_SHIFT;
+	nr_pages = log2_to_int_t(long, order);
+
+	folio = (struct folio *)alloc_pages_node(
+		nid, gfp | __GFP_ZERO | __GFP_COMP, order);
+	if (!folio)
+		return ERR_PTR(-ENOMEM);
+
+	/* A fresh table starts out unlinked from any list */
+	meta = folio_to_meta(folio);
+	meta->owner = owner;
+	meta->lg2sz = lg2sz;
+	meta->free_next = NULL;
+
+	mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, nr_pages);
+	lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, nr_pages);
+
+	return folio_address(folio);
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_alloc, GENERIC_PT);
+
+/* Free every table on @list, reversing the accounting of pt_radix_alloc(). */
+void pt_radix_free_list(struct pt_radix_list_head *list)
+{
+	struct pt_radix_meta *meta, *next;
+
+	for (meta = list->head; meta; meta = next) {
+		struct folio *folio = meta_to_folio(meta);
+		unsigned int order = folio_order(folio);
+		long pgcnt = 1UL << order;
+
+		next = meta->free_next;	/* read the link before the free */
+		mod_node_page_state(folio_pgdat(folio), NR_IOMMU_PAGES, -pgcnt);
+		lruvec_stat_mod_folio(folio, NR_SECONDARY_PAGETABLE, -pgcnt);
+		folio->mapping = NULL;	/* shares storage with lg2sz & flags */
+		__free_pages(&folio->page, order);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_free_list, GENERIC_PT);
+
+/* Free the table at @radix by passing its meta to pt_radix_free_list(). */
+void pt_radix_free(void *radix)
+{
+	struct pt_radix_list_head list = { .head = virt_to_meta(radix) };
+
+	pt_radix_free_list(&list);
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_free, GENERIC_PT);
+
+/* RCU callback: free the list whose first meta embeds @head. */
+static void pt_radix_free_list_rcu_cb(struct rcu_head *head)
+{
+	struct pt_radix_meta *first;
+
+	first = container_of(head, struct pt_radix_meta, rcu_head);
+	pt_radix_free_list(&(struct pt_radix_list_head){ .head = first });
+}
+
+/* Free @list from an RCU callback queued on the first entry's rcu_head. */
+void pt_radix_free_list_rcu(struct pt_radix_list_head *list)
+{
+	if (list->head)
+		call_rcu(&list->head->rcu_head, pt_radix_free_list_rcu_cb);
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_free_list_rcu, GENERIC_PT);
+
+/*
+ * For incoherent memory we use the DMA API to manage the cache flushing. This
+ * is a lot of complexity compared to just calling arch_sync_dma_for_device(),
+ * but it is what the existing iommu drivers have been doing. Maps @radix for
+ * @dma_dev and records the incoherent and @still_flushing state in its meta.
+ * Returns 0 on success or a negative errno.
+ */
+int pt_radix_start_incoherent(void *radix, struct device *dma_dev,
+			      bool still_flushing)
+{
+	struct pt_radix_meta *meta = virt_to_meta(radix);
+	size_t size = log2_to_int_t(size_t, meta->lg2sz);
+	dma_addr_t dma;
+
+	dma = dma_map_single(dma_dev, radix, size, DMA_TO_DEVICE);
+	if (dma_mapping_error(dma_dev, dma))
+		return -EINVAL;
+
+	/* The DMA API is not allowed to do anything other than DMA direct. */
+	if (WARN_ON(dma != virt_to_phys(radix))) {
+		dma_unmap_single(dma_dev, dma, size, DMA_TO_DEVICE);
+		return -EOPNOTSUPP;
+	}
+	meta->incoherent = 1;
+	meta->still_flushing = still_flushing;	/* caller's choice, not always 1 */
+	return 0;
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_start_incoherent, GENERIC_PT);
+
+/* Start incoherent handling for each entry of @list not yet marked so. */
+int pt_radix_start_incoherent_list(struct pt_radix_list_head *list,
+				   struct device *dma_dev)
+{
+	struct pt_radix_meta *meta = list->head;
+	int ret = 0;
+
+	/* Walk until the end of the list or the first failure */
+	while (meta && !ret) {
+		void *radix = folio_address(meta_to_folio(meta));
+
+		if (!meta->incoherent)
+			ret = pt_radix_start_incoherent(radix, dma_dev, false);
+		meta = meta->free_next;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_start_incoherent_list, GENERIC_PT);
+
+/* DMA unmap every entry of @list that is marked incoherent. */
+void pt_radix_stop_incoherent_list(struct pt_radix_list_head *list,
+				   struct device *dma_dev)
+{
+	struct pt_radix_meta *meta;
+
+	for (meta = list->head; meta; meta = meta->free_next) {
+		void *radix = folio_address(meta_to_folio(meta));
+
+		if (meta->incoherent)
+			dma_unmap_single(dma_dev, virt_to_phys(radix),
+					 log2_to_int_t(size_t, meta->lg2sz),
+					 DMA_TO_DEVICE);
+	}
+}
+EXPORT_SYMBOL_NS_GPL(pt_radix_stop_incoherent_list, GENERIC_PT);
diff --git a/drivers/iommu/generic_pt/pt_alloc.h b/drivers/iommu/generic_pt/pt_alloc.h
new file mode 100644
index 00000000000000..9751cc63b7d13f
--- /dev/null
+++ b/drivers/iommu/generic_pt/pt_alloc.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+ */
+#ifndef __GENERIC_PT_PT_ALLOC_H
+#define __GENERIC_PT_PT_ALLOC_H
+
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/device.h>
+
+/*
+ * Per radix table level allocation meta data. This is very similar in purpose
+ * to the struct ptdesc.
+ *
+ * radix levels have special properties:
+ *   - Always a power of two size
+ *   - Can be threaded on a list without a memory allocation
+ *   - Can be RCU freed without a memory allocation
+ */
+struct pt_radix_meta {
+	unsigned long __page_flags;	/* same offset as struct page flags */
+
+	struct rcu_head rcu_head;	/* used by pt_radix_free_list_rcu() */
+	union {
+		struct {
+			u8 lg2sz;	/* value given to pt_radix_alloc() */
+			u8 incoherent;	/* set once DMA mapped */
+			u8 still_flushing; /* cleared when the flush is done */
+		};
+		unsigned long __page_mapping;	/* same offset as page mapping */
+	};
+	struct pt_common *owner;	/* owner given to pt_radix_alloc() */
+	struct pt_radix_meta *free_next; /* next entry on a pt_radix_list_head */
+
+	unsigned int __page_type;	/* same offset as page page_type */
+	atomic_t __page_refcount;	/* same offset as page _refcount */
+#ifdef CONFIG_MEMCG
+	unsigned long memcg_data;	/* same offset as page memcg_data */
+#endif
+};
+
+static inline struct pt_radix_meta *folio_to_meta(struct folio *folio)
+{
+	return (struct pt_radix_meta *)folio;	/* plain cast, same pointer */
+}
+
+static inline struct pt_radix_meta *virt_to_meta(const void *addr)
+{
+	return folio_to_meta(virt_to_folio(addr));	/* addr -> folio -> meta */
+}
+
+struct pt_radix_list_head {
+	struct pt_radix_meta *head;
+};
+
+void *pt_radix_alloc(struct pt_common *owner, int nid, size_t log2size,
+		     gfp_t gfp);
+void pt_radix_free(void *radix);
+void pt_radix_free_list(struct pt_radix_list_head *list);
+void pt_radix_free_list_rcu(struct pt_radix_list_head *list);
+
+static inline void pt_radix_add_list(struct pt_radix_list_head *head,
+				     void *radix)
+{
+	struct pt_radix_meta *meta = virt_to_meta(radix);
+
+	meta->free_next = head->head;	/* chain to the previous first entry */
+	head->head = meta;		/* @radix becomes the new first entry */
+}
+
+int pt_radix_start_incoherent(void *radix, struct device *dma_dev,
+			      bool still_flushing);
+int pt_radix_start_incoherent_list(struct pt_radix_list_head *list,
+				   struct device *dma_dev);
+void pt_radix_stop_incoherent_list(struct pt_radix_list_head *list,
+				   struct device *dma_dev);
+
+static inline void pt_radix_done_incoherent_flush(void *radix)
+{
+	struct pt_radix_meta *meta = virt_to_meta(radix);
+
+	/*
+	 * Release/acquire is against the cache flush,
+	 * pt_radix_incoherent_still_flushing() must not return 0 until the HW
+	 * observes the flush.
+	 */
+	smp_store_release(&meta->still_flushing, 0);
+}
+
+static inline bool pt_radix_incoherent_still_flushing(void *radix)
+{
+	struct pt_radix_meta *meta = virt_to_meta(radix);
+
+	return smp_load_acquire(&meta->still_flushing);	/* pairs with the release */
+}
+
+#endif
-- 
2.46.0