The cut operation allows breaking large pages into smaller pages with specific purpose of allowing unmapping of a portion of the previously large page. Specifically it ensures that no large page crosses the cut point, and thus, we can successfully unmap starting/ending on the cut point. This is the operation that VFIO type 1 v1.0 implicitly supported and some iommu drivers provided internal to their unmap operation. Implement the cut operation to be hitless, changes to the page table during cutting must cause zero disruption to any ongoing DMA. This is the expectation of the VFIO type 1 uAPI. Hitless requires HW support, it is incompatible with HW requiring break-before-make. Having it separate from unmap makes it much easier to handle failure cases. Since cut is fully hitless, even in failure cases, a caller can simply do nothing if cut fails. Cut during unmap requries dealing with the potentially nasty case where some of the IOVA range has been unmapped but some cannot be unmapped. The operation is generalized into a form that iommufd can use for it's existing area split operation. By placing cuts on the boundaries of the split we can subdivide an area and maintain all existing semantics even with large pages in the page table. Cut is optimal and generates a page table that is equivalent to calling map twice, that is the two halves will still use maximal page sizes. FIXME: requires deeper kunit tests Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxx> --- drivers/iommu/generic_pt/iommu_pt.h | 263 ++++++++++++++++++++++++++++ include/linux/generic_pt/iommu.h | 29 +++ 2 files changed, 292 insertions(+) diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h index a886c94a33eb6c..4fccdcd58d4ba6 100644 --- a/drivers/iommu/generic_pt/iommu_pt.h +++ b/drivers/iommu/generic_pt/iommu_pt.h @@ -232,6 +232,268 @@ struct pt_iommu_map_args { pt_oaddr_t oa; }; +/* + * Build an entire sub tree of tables separate from the active table. This is + * used to build an entire mapping and then once complete atomically place it in + * the table. This is a simplified version of map since we know there is no + * concurrency and all the tables start zero filled. + */ +static int __build_tree(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct pt_state pts = pt_init(range, level, table); + struct pt_iommu_map_args *build = arg; + int ret; + + for_each_pt_level_item(&pts) { + unsigned int pgsize_lg2 = + pt_compute_best_pgsize(&pts, build->oa); + + if (pgsize_lg2) { + /* Private population can not see table entries other than 0. */ + if (PT_WARN_ON(pts.type != PT_ENTRY_EMPTY)) + return -EADDRINUSE; + + pt_install_leaf_entry(&pts, build->oa, pgsize_lg2, + &build->attrs); + pts.type = PT_ENTRY_OA; + build->oa += log2_to_int(pgsize_lg2); + continue; + } + + if (pts.type == PT_ENTRY_EMPTY) { + /* start_incoherent is done after the table is filled */ + ret = pt_iommu_new_table(&pts, &build->attrs, true); + if (ret) + return ret; + pt_radix_add_list(&build->free_list, pts.table_lower); + } else if (PT_WARN_ON(pts.type != PT_ENTRY_TABLE)) { + return -EINVAL; + } + + ret = pt_descend(&pts, arg, __build_tree); + if (ret) + return ret; + } + return 0; +} + +/* + * Replace the OA entry patent_pts points at with a tree of OA entries. The tree + * is organized so that parent_pts->va is a cut point. The created mappings will + * still have optimized page sizes within the cut point. + */ +static int replace_cut_table(struct pt_state *parent_pts, + const struct pt_write_attrs *parent_attrs) +{ + struct pt_common *common = parent_pts->range->common; + struct pt_iommu_map_args build = { + .attrs = *parent_attrs, + .oa = pt_entry_oa(parent_pts), + }; + pt_vaddr_t cut_start_va = parent_pts->range->va; + pt_vaddr_t entry_start_va = + log2_set_mod(cut_start_va, 0, pt_table_item_lg2sz(parent_pts)); + pt_vaddr_t entry_last_va = + log2_set_mod_max(cut_start_va, pt_table_item_lg2sz(parent_pts)); + struct pt_table_p *table_mem; + int ret; + + if (unlikely(!pt_can_have_table(parent_pts))) + return -ENXIO; + + if (PT_WARN_ON(entry_start_va == cut_start_va)) + return -ENXIO; + + if (!pts_feature(parent_pts, PT_FEAT_OA_TABLE_XCHG)) + return -EOPNOTSUPP; + + table_mem = table_alloc(parent_pts, parent_attrs->gfp, true); + if (IS_ERR(table_mem)) + return PTR_ERR(table_mem); + pt_radix_add_list(&build.free_list, table_mem); + parent_pts->table_lower = table_mem; + + /* Fill from the start of the table to the cut point */ + ret = pt_walk_child_range(parent_pts, entry_start_va, cut_start_va - 1, + __build_tree, &build); + if (ret) + goto err_free; + + /* Fill from the cut point to the end of the table */ + ret = pt_walk_child_range(parent_pts, cut_start_va, entry_last_va, + __build_tree, &build); + if (ret) + goto err_free; + + /* + * Avoid double flushing when building a tree privately. All the tree + * memory is initialized now so flush it before installing it. This + * thread is the exclusive owner of the item being split so we don't + * need to worry about still flushing. + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) { + ret = pt_radix_start_incoherent_list( + &build.free_list, + iommu_from_common(common)->iommu_device); + if (ret) + goto err_free; + } + + if (!pt_install_table(parent_pts, virt_to_phys(table_mem), + parent_attrs)) { + /* + * This only fails if the table entry changed while we were + * building the sub tree, which would be a locking violation. + */ + WARN(true, "Locking violating during cut"); + ret = -EINVAL; + goto err_free; + } + + return 0; + +err_free: + /* + * None of the allocated memory was ever reachable outside this function + */ + if (pt_feature(common, PT_FEAT_DMA_INCOHERENT)) + pt_radix_stop_incoherent_list( + &build.free_list, + iommu_from_common(common)->iommu_device); + pt_radix_free_list(&build.free_list); + parent_pts->table_lower = NULL; + return ret; +} + +static void __replace_cut_entry(const struct pt_state *parent_pts, + struct pt_iommu_map_args *replace, + unsigned int start_index, + unsigned int end_index) +{ + struct pt_range range = + pt_range_slice(parent_pts, start_index, end_index); + struct pt_state pts = + pt_init(&range, parent_pts->level, parent_pts->table); + + if (start_index == end_index) + return; + + for_each_pt_level_item(&pts) { + unsigned int pgsize_lg2 = + pt_compute_best_pgsize(&pts, replace->oa); + + if (PT_WARN_ON(pts.type != PT_ENTRY_OA) || + PT_WARN_ON(!pgsize_lg2)) + continue; + + pt_install_leaf_entry(&pts, replace->oa, pgsize_lg2, + &replace->attrs); + pts.type = PT_ENTRY_OA; + replace->oa += log2_to_int(pgsize_lg2); + } +} + +/* + * This is a little more complicated than just clearing a contig bit because + * some formats have multi-size contigs and we still want to use best page sizes + * for each half of the cut. So we remap over the current values with new + * correctly sized entries. + */ +static void replace_cut_entry(const struct pt_state *parent_pts, + const struct pt_write_attrs *parent_attrs) +{ + struct pt_iommu_map_args replace = { + .attrs = *parent_attrs, + .oa = pt_entry_oa(parent_pts), + }; + unsigned int start_index = log2_set_mod( + parent_pts->index, 0, pt_entry_num_contig_lg2(parent_pts)); + unsigned int cut_index = parent_pts->index; + unsigned int last_index = log2_set_mod( + parent_pts->index, + log2_to_int(pt_entry_num_contig_lg2(parent_pts)) - 1, + pt_entry_num_contig_lg2(parent_pts)); + + pt_attr_from_entry(parent_pts, &replace.attrs); + + if (!log2_mod(parent_pts->range->va, pt_table_item_lg2sz(parent_pts))) { + /* The cut start at an item boundary */ + __replace_cut_entry(parent_pts, &replace, start_index, + cut_index); + __replace_cut_entry(parent_pts, &replace, cut_index, + last_index + 1); + } else { + /* cut_index will be replaced by a table */ + if (start_index != cut_index) + __replace_cut_entry(parent_pts, &replace, start_index, + cut_index - 1); + __replace_cut_entry(parent_pts, &replace, cut_index, + cut_index + 1); + if (cut_index != last_index) + __replace_cut_entry(parent_pts, &replace, cut_index + 1, + last_index + 1); + } +} + +static int __cut_mapping(struct pt_range *range, void *arg, unsigned int level, + struct pt_table_p *table) +{ + struct iommu_write_log wlog __cleanup(done_writes) = { .range = range }; + struct pt_state pts = pt_init(range, level, table); + const struct pt_write_attrs *cut_attrs = arg; + +again: + switch (pt_load_single_entry(&pts)) { + case PT_ENTRY_EMPTY: + return -ENOENT; + case PT_ENTRY_TABLE: + return pt_descend(&pts, arg, __cut_mapping); + case PT_ENTRY_OA: { + /* This entry's OA starts at the cut point, all done */ + if (!log2_mod(range->va, pt_entry_oa_lg2sz(&pts))) + return 0; + + record_write(&wlog, &pts, pt_entry_num_contig_lg2(&pts)); + + /* This is a contiguous entry, split it down */ + if (pt_entry_num_contig_lg2(&pts) != ilog2(1)) { + if (!pts_feature(&pts, PT_FEAT_OA_SIZE_CHANGE)) + return -EOPNOTSUPP; + replace_cut_entry(&pts, cut_attrs); + goto again; + } + + /* + * Need to replace an OA with a table. The new table will map + * the same OA as the table item, just with smaller granularity. + */ + return replace_cut_table(&pts, cut_attrs); + } + } + return -ENOENT; +} + +/* + * FIXME this is currently incompatible with active dirty tracking as we + * don't take care to capture or propagate the dirty bits during the mutation. + */ +static int NS(cut_mapping)(struct pt_iommu *iommu_table, dma_addr_t cut_iova, + gfp_t gfp) +{ + struct pt_write_attrs cut_attrs = { + .gfp = gfp, + }; + struct pt_range range; + int ret; + + ret = make_range(common_from_iommu(iommu_table), &range, cut_iova, 1); + if (ret) + return ret; + + return pt_walk_range(&range, __cut_mapping, &cut_attrs); +} + /* * Check that the items in a contiguous block are all empty. This will * recursively check any tables in the block to validate they are empty and @@ -624,6 +886,7 @@ static const struct pt_iommu_ops NS(ops) = { .map_pages = NS(map_pages), .unmap_pages = NS(unmap_pages), .iova_to_phys = NS(iova_to_phys), + .cut_mapping = NS(cut_mapping), .get_info = NS(get_info), .deinit = NS(deinit), }; diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h index 88e45d21dd21c4..d83f293209fa77 100644 --- a/include/linux/generic_pt/iommu.h +++ b/include/linux/generic_pt/iommu.h @@ -113,6 +113,35 @@ struct pt_iommu_ops { dma_addr_t len, struct iommu_iotlb_gather *iotlb_gather); + /** + * cut_mapping() - Split a mapping + * @iommu_table: Table to manipulate + * @iova: IO virtual address to cut at + * @gfp: GFP flags for any memory allocations + * + * If map was used on [iova_a, iova_b] then unmap must be used on the + * same interval. When called twice this is useful to unmap a portion of + * a larger mapping. + * + * cut_mapping() changes the page table so that umap of both: + * [iova_a, iova_c - 1] + * [iova_c, iova_b] + * will work. + * + * In practice this is done by breaking up large pages into smaller + * pages so that no large page crosses iova_c. + * + * cut_mapping() works to ensure all page sizes that don't cross the cut + * remain at the optimal sizes. + * + * Context: The caller must hold a write range lock that includes the + * entire range used with the map that contains iova. + * + * Returns: -ERRNO on failure, 0 on success. + */ + int (*cut_mapping)(struct pt_iommu *iommu_table, dma_addr_t cut_iova, + gfp_t gfp); + /** * iova_to_phys() - Return the output address for the given IOVA * @iommu_table: Table to query -- 2.46.0