On 28-Mar-19 15:08, Jason Gunthorpe wrote:
> On Thu, Mar 28, 2019 at 02:39:30PM +0200, Gal Pressman wrote:
>
>> +static int umem_to_page_list(struct efa_dev *dev,
>> +                             struct ib_umem *umem,
>> +                             u64 *page_list,
>> +                             u32 hp_cnt,
>> +                             u8 hp_shift)
>> +{
>> +        u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
>> +        unsigned int page_idx = 0;
>> +        unsigned int hp_idx = 0;
>> +        struct scatterlist *sg;
>> +        unsigned int entry;
>> +
>> +        if (umem->page_shift != PAGE_SHIFT) {
>> +                ibdev_dbg(&dev->ibdev,
>> +                          "umem invalid page shift %d\n", umem->page_shift);
>> +                return -EINVAL;
>> +        }
>> +
>> +        ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
>> +                  hp_cnt, pages_in_hp);
>> +
>> +        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
>> +                if (sg_dma_len(sg) != PAGE_SIZE) {
>> +                        ibdev_dbg(&dev->ibdev,
>> +                                  "sg_dma_len[%u] != PAGE_SIZE[%lu]\n",
>> +                                  sg_dma_len(sg), PAGE_SIZE);
>> +                        return -EINVAL;
>> +                }
>
> Again, this needs to work with Shiraz's work, drivers can't assume the
> umem sgl has PAGE_SIZE chunks. Use for_each_sg_dma_page

Thanks, I have a couple of questions.

I'm not entirely up to date with Shiraz's work, so I want to make sure I
understand: currently the umem sgl is built out of PAGE_SIZE elements. Is the
reason we can't assume each element is PAGE_SIZE long that the DMA mapping
(IOMMU) might combine multiple elements, or that Shiraz's work is going to
combine the elements explicitly? (A rough sketch of the conversion I have in
mind is further down.)

Reading the documentation, sg_page_iter looks like it covers the
sg_dma_page_iter functionality? Don't they both retrieve the DMA address page
by page? The only difference seems to be that with sg_page_iter I can also
get the struct page?

>
>> +static void efa_cont_pages(struct ib_umem *umem, u64 addr,
>> +                           unsigned long max_page_shift,
>> +                           int *count, u8 *shift, u32 *ncont)
>> +{
>> +        unsigned long page_shift = umem->page_shift;
>> +        struct scatterlist *sg;
>> +        u64 base = ~0, p = 0;
>> +        unsigned long tmp;
>> +        unsigned long m;
>> +        u64 len, pfn;
>> +        int i = 0;
>> +        int entry;
>> +
>> +        addr = addr >> page_shift;
>> +        tmp = (unsigned long)addr;
>> +        m = find_first_bit(&tmp, BITS_PER_LONG);
>> +        if (max_page_shift)
>> +                m = min_t(unsigned long, max_page_shift - page_shift, m);
>> +
>> +        for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
>> +                len = sg_dma_len(sg) >> page_shift;
>
> Shouldn't assume the offset is zero here.

Because elements are not necessarily PAGE_SIZE? If page combining is already
done by ib_core, this whole function can probably be deleted.

>
>> +                pfn = sg_dma_address(sg) >> page_shift;
>> +                if (base + p != pfn) {
>> +                        /*
>> +                         * If either the offset or the new
>> +                         * base are unaligned update m
>> +                         */
>> +                        tmp = (unsigned long)(pfn | p);
>> +                        if (!IS_ALIGNED(tmp, 1 << m))
>> +                                m = find_first_bit(&tmp, BITS_PER_LONG);
>> +
>> +                        base = pfn;
>> +                        p = 0;
>> +                }
>> +
>> +                p += len;
>> +                i += len;
>> +        }
>> +
>> +        if (i) {
>> +                m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
>> +                *ncont = DIV_ROUND_UP(i, (1 << m));
>> +        } else {
>> +                m = 0;
>> +                *ncont = 0;
>> +        }
>> +
>> +        *shift = page_shift + m;
>> +        *count = i;
>> +}
>
> This whole function should really better match what Shiraz is
> doing. It looks like it is just ib_umem_find_single_pg_size() ??

Yes, I think ib_umem_find_single_pg_size() could probably replace this whole
function. That work isn't merged yet though, right?
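Going back to umem_to_page_list, here's roughly what I had in mind for the
for_each_sg_dma_page conversion (untested sketch on top of the posted patch;
the huge-page stride logic stays the same):

static int umem_to_page_list(struct efa_dev *dev,
                             struct ib_umem *umem,
                             u64 *page_list,
                             u32 hp_cnt,
                             u8 hp_shift)
{
        u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
        struct sg_dma_page_iter sg_iter;
        unsigned int page_idx = 0;
        unsigned int hp_idx = 0;

        ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
                  hp_cnt, pages_in_hp);

        /*
         * Walk the DMA-mapped list one PAGE_SIZE chunk at a time, no matter
         * how large each sg element ended up after mapping.
         */
        for_each_sg_dma_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
                /* Record an address only at the start of each huge page */
                if (page_idx % pages_in_hp == 0) {
                        page_list[hp_idx] = sg_page_iter_dma_address(&sg_iter);
                        hp_idx++;
                }

                page_idx++;
        }

        return 0;
}

If I'm reading the iterator correctly, the page_shift check and the
sg_dma_len(sg) != PAGE_SIZE check can simply go away, since
for_each_sg_dma_page already walks the mapped list in PAGE_SIZE steps.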
>
>> +int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
>> +{
>> +        struct efa_ucontext *ucontext = to_eucontext(ibucontext);
>> +        struct efa_dev *dev = to_edev(ibucontext->device);
>> +        struct efa_ibv_alloc_ucontext_resp resp = {};
>> +        struct efa_com_alloc_uar_result result;
>> +        int err;
>> +
>> +        /*
>> +         * it's fine if the driver does not know all request fields,
>> +         * we will ack input fields in our response.
>> +         */
>> +
>> +        err = efa_com_alloc_uar(&dev->edev, &result);
>> +        if (err)
>> +                goto err_out;
>> +
>> +        ucontext->uarn = result.uarn;
>> +        xa_init_flags(&ucontext->mmap_xa, XA_FLAGS_ALLOC);
>> +
>> +        resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
>> +        resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
>> +        resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
>> +        resp.inline_buf_size = dev->dev_attr.inline_buf_size;
>> +        resp.max_llq_size = dev->dev_attr.max_llq_size;
>> +
>> +        if (udata && udata->outlen) {
>
> That is a weird test.. Why outlen?

I make sure the user is expecting data before calling ib_copy_to_udata().
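To be concrete, the idea is to clamp the response copy to whatever userspace
allocated, along these lines (sketch, not the exact patch hunk; the error
label name is illustrative):

        if (udata && udata->outlen) {
                /* Never copy more than userspace provided room for */
                err = ib_copy_to_udata(udata, &resp,
                                       min(sizeof(resp), udata->outlen));
                if (err)
                        goto err_dealloc_uar; /* illustrative cleanup label */
        }

        return 0;

Old userspace that passes outlen == 0 simply doesn't get the response blob,
rather than failing the verb.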