> In addition, > Moving to use the HMM enables reducing page faults in the system as it > exposes the snapshot mode, this will be introduced in the next patches from > this series. > > As part of this, clean up some flows and use the required data structures > to work with HMM. Just saying HMM here seems weird. The function is hmm_range_fault. And it really needs to grow a better name eventually. > unsigned long start; > unsigned long end; > - size_t pages; > + size_t ndmas, npfns; > > start = ALIGN_DOWN(umem_odp->umem.address, page_size); > if (check_add_overflow(umem_odp->umem.address, > @@ -71,20 +72,21 @@ static inline int ib_init_umem_odp(struct ib_umem_odp *umem_odp, > if (unlikely(end < page_size)) > return -EOVERFLOW; > > - pages = (end - start) >> umem_odp->page_shift; > - if (!pages) > + ndmas = (end - start) >> umem_odp->page_shift; > + if (!ndmas) > return -EINVAL; > > - umem_odp->page_list = kvcalloc( > - pages, sizeof(*umem_odp->page_list), GFP_KERNEL); > - if (!umem_odp->page_list) > + npfns = (end - start) >> PAGE_SHIFT; > + umem_odp->pfn_list = kvcalloc( > + npfns, sizeof(*umem_odp->pfn_list), GFP_KERNEL); > + if (!umem_odp->pfn_list) > return -ENOMEM; > > umem_odp->dma_list = kvcalloc( > - pages, sizeof(*umem_odp->dma_list), GFP_KERNEL); > + ndmas, sizeof(*umem_odp->dma_list), GFP_KERNEL); > if (!umem_odp->dma_list) { > ret = -ENOMEM; > - goto out_page_list; > + goto out_pfn_list; Why do you rename these variables? We're still mapping pages. > static int ib_umem_odp_map_dma_single_page( > struct ib_umem_odp *umem_odp, > + unsigned int dma_index, > struct page *page, > + u64 access_mask) > { > struct ib_device *dev = umem_odp->umem.ibdev; > dma_addr_t dma_addr; > > + if (umem_odp->dma_list[dma_index]) { Note that 0 is a valid DMA address. I think due to the access bit this works, but it is a little subtle. > + umem_odp->dma_list[dma_index] &= ODP_DMA_ADDR_MASK; > + umem_odp->dma_list[dma_index] |= access_mask; This looks a little weird. 
Instead of &= ODP_DMA_ADDR_MASK I'd do a &= ~ACCESS_MASK as that makes the code a lot more logical. But more importantly, except for (dma_addr_t)-1 (DMA_MAPPING_ERROR) all dma_addr_t values are valid, so taking more than a single bit from a dma_addr_t is not strictly speaking correct. > + return 0; > } > > + dma_addr = > + ib_dma_map_page(dev, page, 0, BIT(umem_odp->page_shift), > + DMA_BIDIRECTIONAL); The use of the BIT macro, which is already obfuscating in its intended place, is really out of place here. > + if (ib_dma_mapping_error(dev, dma_addr)) > + return -EFAULT; > + > + umem_odp->dma_list[dma_index] = dma_addr | access_mask; > + umem_odp->npages++; > + return 0; I'd change the calling conventions and write the whole thing as: static int ib_umem_odp_map_dma_single_page(struct ib_umem_odp *umem_odp, unsigned int dma_index, struct page *page, u64 access_mask) { struct ib_device *dev = umem_odp->umem.ibdev; dma_addr_t *dma_addr = &umem_odp->dma_list[dma_index]; if (!*dma_addr) { *dma_addr = ib_dma_map_page(dev, page, 0, 1 << umem_odp->page_shift, DMA_BIDIRECTIONAL); if (ib_dma_mapping_error(dev, *dma_addr)) return -EFAULT; umem_odp->npages++; } *dma_addr &= ~ODP_ACCESS_MASK; *dma_addr |= access_mask; return 0; } > + /* > + * No need to check whether the MTTs really belong to > + * this MR, since ib_umem_odp_map_dma_and_lock already > + * checks this. > + */ You could easily squeeze the three lines into two while still staying under 80 characters for each line.