From: Jason Gunthorpe <jgg@xxxxxxxxxxxx> This fixes a race of the form: CPU0 CPU1 mlx5_ib_invalidate_range() mlx5_ib_invalidate_range() // This one actually makes npages == 0 ib_umem_odp_unmap_dma_pages() if (npages == 0 && !dying) // This one does nothing ib_umem_odp_unmap_dma_pages() if (npages == 0 && !dying) dying = 1; dying = 1; schedule_work(&umem_odp->work); // Double schedule of the same work schedule_work(&umem_odp->work); // BOOM npages and dying must be read and written under the umem_mutex lock. Since whenever ib_umem_odp_unmap_dma_pages() is called mlx5 must also call mlx5_ib_update_xlt, and both need to be done in the same locking region, hoist the lock out of unmap. This avoids an expensive double critical section in mlx5_ib_invalidate_range(). Fixes: 81713d3788d2 ("IB/mlx5: Add implicit MR support") Reviewed-by: Artemy Kovalyov <artemyko@xxxxxxxxxxxx> Signed-off-by: Jason Gunthorpe <jgg@xxxxxxxxxxxx> --- drivers/infiniband/core/umem_odp.c | 6 ++++-- drivers/infiniband/hw/mlx5/odp.c | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index f67a30fda1ed9a..163ff7ba92b7f1 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -451,8 +451,10 @@ void ib_umem_odp_release(struct ib_umem_odp *umem_odp) * that the hardware will not attempt to access the MR any more. */ if (!umem_odp->is_implicit_odp) { + mutex_lock(&umem_odp->umem_mutex); ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); + mutex_unlock(&umem_odp->umem_mutex); kvfree(umem_odp->dma_list); kvfree(umem_odp->page_list); } @@ -719,6 +721,8 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, u64 addr; struct ib_device *dev = umem_odp->umem.ibdev; + lockdep_assert_held(&umem_odp->umem_mutex); + virt = max_t(u64, virt, ib_umem_start(umem_odp)); bound = min_t(u64, bound, ib_umem_end(umem_odp)); /* Note that during the run of this function, the @@ -726,7 +730,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, * faults from completion. We might be racing with other * invalidations, so we must make sure we free each page only * once. */ - mutex_lock(&umem_odp->umem_mutex); for (addr = virt; addr < bound; addr += BIT(umem_odp->page_shift)) { idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift; if (umem_odp->page_list[idx]) { @@ -757,7 +760,6 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt, umem_odp->npages--; } } - mutex_unlock(&umem_odp->umem_mutex); } EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index 3401c06b7e54f5..1930d78c3091cb 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -308,7 +308,6 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, idx - blk_start_idx + 1, 0, MLX5_IB_UPD_XLT_ZAP | MLX5_IB_UPD_XLT_ATOMIC); - mutex_unlock(&umem_odp->umem_mutex); /* * We are now sure that the device will not access the * memory. We can safely unmap it, and mark it as dirty if @@ -319,10 +318,11 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start, if (unlikely(!umem_odp->npages && mr->parent && !umem_odp->dying)) { - WRITE_ONCE(umem_odp->dying, 1); + umem_odp->dying = 1; atomic_inc(&mr->parent->num_leaf_free); schedule_work(&umem_odp->work); } + mutex_unlock(&umem_odp->umem_mutex); } void mlx5_ib_internal_fill_odp_caps(struct mlx5_ib_dev *dev) @@ -585,15 +585,19 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) if (mr->parent != imr) continue; + mutex_lock(&umem_odp->umem_mutex); ib_umem_odp_unmap_dma_pages(umem_odp, ib_umem_start(umem_odp), ib_umem_end(umem_odp)); - if (umem_odp->dying) + if (umem_odp->dying) { + mutex_unlock(&umem_odp->umem_mutex); continue; + } - WRITE_ONCE(umem_odp->dying, 1); + umem_odp->dying = 1; atomic_inc(&imr->num_leaf_free); schedule_work(&umem_odp->work); + mutex_unlock(&umem_odp->umem_mutex); } up_read(&per_mm->umem_rwsem); -- 2.23.0