Hi Mike,

> On 07/18/23 01:26, Vivek Kasireddy wrote:
> > A user or admin can configure a VMM (Qemu) Guest's memory to be
> > backed by hugetlb pages for various reasons. However, a Guest OS
> > would still allocate (and pin) buffers that are backed by regular
> > 4k sized pages. In order to map these buffers and create dma-bufs
> > for them on the Host, we first need to find the hugetlb pages where
> > the buffer allocations are located and then determine the offsets
> > of individual chunks (within those pages) and use this information
> > to eventually populate a scatterlist.
> >
> > Testcase: default_hugepagesz=2M hugepagesz=2M hugepages=2500 options
> > were passed to the Host kernel and Qemu was launched with these
> > relevant options: qemu-system-x86_64 -m 4096m....
> > -device virtio-gpu-pci,max_outputs=1,blob=true,xres=1920,yres=1080
> > -display gtk,gl=on
> > -object memory-backend-memfd,hugetlb=on,id=mem1,size=4096M
> > -machine memory-backend=mem1
> >
> > Replacing -display gtk,gl=on with -display gtk,gl=off above would
> > exercise the mmap handler.
> >
> > v2: Updated get_sg_table() to manually populate the scatterlist for
> > both huge page and non-huge-page cases.
> >
> > Cc: David Hildenbrand <david@xxxxxxxxxx>
> > Cc: Mike Kravetz <mike.kravetz@xxxxxxxxxx>
> > Cc: Hugh Dickins <hughd@xxxxxxxxxx>
> > Cc: Peter Xu <peterx@xxxxxxxxxx>
> > Cc: Jason Gunthorpe <jgg@xxxxxxxxxx>
> > Cc: Gerd Hoffmann <kraxel@xxxxxxxxxx>
> > Cc: Dongwon Kim <dongwon.kim@xxxxxxxxx>
> > Cc: Junxiao Chang <junxiao.chang@xxxxxxxxx>
> > Signed-off-by: Vivek Kasireddy <vivek.kasireddy@xxxxxxxxx>
> > ---
> >  drivers/dma-buf/udmabuf.c | 84 ++++++++++++++++++++++++++++++++++------
> >  1 file changed, 71 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/dma-buf/udmabuf.c b/drivers/dma-buf/udmabuf.c
> > index 820c993c8659..10c47bf77fb5 100644
> > --- a/drivers/dma-buf/udmabuf.c
> > +++ b/drivers/dma-buf/udmabuf.c
> > @@ -10,6 +10,7 @@
> >  #include <linux/miscdevice.h>
> >  #include <linux/module.h>
> >  #include <linux/shmem_fs.h>
> > +#include <linux/hugetlb.h>
> >  #include <linux/slab.h>
> >  #include <linux/udmabuf.h>
> >  #include <linux/vmalloc.h>
> > @@ -28,6 +29,7 @@ struct udmabuf {
> >          struct page **pages;
> >          struct sg_table *sg;
> >          struct miscdevice *device;
> > +        pgoff_t *offsets;
> >  };
> >
> >  static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> > @@ -41,6 +43,10 @@ static vm_fault_t udmabuf_vm_fault(struct vm_fault *vmf)
> >                  return VM_FAULT_SIGBUS;
> >
> >          pfn = page_to_pfn(ubuf->pages[pgoff]);
> > +        if (ubuf->offsets) {
> > +                pfn += ubuf->offsets[pgoff] >> PAGE_SHIFT;
> > +        }
> > +
> >          return vmf_insert_pfn(vma, vmf->address, pfn);
> >  }
> >
> > @@ -90,23 +96,31 @@ static struct sg_table *get_sg_table(struct device *dev, struct dma_buf *buf,
> >  {
> >          struct udmabuf *ubuf = buf->priv;
> >          struct sg_table *sg;
> > +        struct scatterlist *sgl;
> > +        pgoff_t offset;
> > +        unsigned long i = 0;
> >          int ret;
> >
> >          sg = kzalloc(sizeof(*sg), GFP_KERNEL);
> >          if (!sg)
> >                  return ERR_PTR(-ENOMEM);
> > -        ret = sg_alloc_table_from_pages(sg, ubuf->pages, ubuf->pagecount,
> > -                                        0, ubuf->pagecount << PAGE_SHIFT,
> > -                                        GFP_KERNEL);
> > +
> > +        ret = sg_alloc_table(sg, ubuf->pagecount, GFP_KERNEL);
> >          if (ret < 0)
> > -                goto err;
> > +                goto err_alloc;
> > +
> > +        for_each_sg(sg->sgl, sgl, ubuf->pagecount, i) {
> > +                offset = ubuf->offsets ? ubuf->offsets[i] : 0;
> > +                sg_set_page(sgl, ubuf->pages[i], PAGE_SIZE, offset);
> > +        }
> >          ret = dma_map_sgtable(dev, sg, direction, 0);
> >          if (ret < 0)
> > -                goto err;
> > +                goto err_map;
> >          return sg;
> >
> > -err:
> > +err_map:
> >          sg_free_table(sg);
> > +err_alloc:
> >          kfree(sg);
> >          return ERR_PTR(ret);
> >  }
> > @@ -143,6 +157,7 @@ static void release_udmabuf(struct dma_buf *buf)
> >
> >          for (pg = 0; pg < ubuf->pagecount; pg++)
> >                  put_page(ubuf->pages[pg]);
> > +        kfree(ubuf->offsets);
> >          kfree(ubuf->pages);
> >          kfree(ubuf);
> >  }
> > @@ -206,7 +221,9 @@ static long udmabuf_create(struct miscdevice *device,
> >          struct udmabuf *ubuf;
> >          struct dma_buf *buf;
> >          pgoff_t pgoff, pgcnt, pgidx, pgbuf = 0, pglimit;
> > -        struct page *page;
> > +        struct page *page, *hpage = NULL;
> > +        pgoff_t hpoff, chunkoff, maxchunks;
> > +        struct hstate *hpstate;
> >          int seals, ret = -EINVAL;
> >          u32 i, flags;
> >
> > @@ -242,7 +259,7 @@ static long udmabuf_create(struct miscdevice *device,
> >                  if (!memfd)
> >                          goto err;
> >                  mapping = memfd->f_mapping;
> > -                if (!shmem_mapping(mapping))
> > +                if (!shmem_mapping(mapping) && !is_file_hugepages(memfd))
> >                          goto err;
> >                  seals = memfd_fcntl(memfd, F_GET_SEALS, 0);
> >                  if (seals == -EINVAL)
> > @@ -253,16 +270,56 @@ static long udmabuf_create(struct miscdevice *device,
> >                          goto err;
> >                  pgoff = list[i].offset >> PAGE_SHIFT;
> >                  pgcnt = list[i].size >> PAGE_SHIFT;
> > +                if (is_file_hugepages(memfd)) {
> > +                        if (!ubuf->offsets) {
> > +                                ubuf->offsets = kmalloc_array(ubuf->pagecount,
> > +                                                              sizeof(*ubuf->offsets),
> > +                                                              GFP_KERNEL);
> > +                                if (!ubuf->offsets) {
> > +                                        ret = -ENOMEM;
> > +                                        goto err;
> > +                                }
> > +                        }
> > +                        hpstate = hstate_file(memfd);
> > +                        hpoff = list[i].offset >> huge_page_shift(hpstate);
> > +                        chunkoff = (list[i].offset &
> > +                                    ~huge_page_mask(hpstate)) >> PAGE_SHIFT;
> > +                        maxchunks = huge_page_size(hpstate) >> PAGE_SHIFT;
> > +                }
> >                  for (pgidx = 0; pgidx < pgcnt; pgidx++) {
> > -                        page = shmem_read_mapping_page(mapping, pgoff + pgidx);
> > -                        if (IS_ERR(page)) {
> > -                                ret = PTR_ERR(page);
> > -                                goto err;
> > +                        if (is_file_hugepages(memfd)) {
> > +                                if (!hpage) {
> > +                                        hpage = find_get_page_flags(mapping, hpoff,
> > +                                                                    FGP_ACCESSED);
> > +                                        if (!hpage) {
> > +                                                ret = -EINVAL;
> > +                                                goto err;
> > +                                        }
> > +                                }
> > +                                get_page(hpage);
>
> Is the intention to increase the ref count of the hugetlb page once for
> each 'sub-page' added? Or, am I reading that incorrectly?

Yes, that's the intention; I figured it makes sense to do it that way.
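To spell out the accounting I had in mind, here is a plain userspace sketch of
the refcount balance (it only models the counting; the constant and names are
made up and it is not the kernel code itself): find_get_page_flags() takes one
reference on the head page, each 4k chunk stored in ubuf->pages[] takes another
via get_page(), that lookup reference is dropped once the huge page is fully
consumed, and release_udmabuf() later drops one reference per array slot:

/*
 * Userspace model of the intended refcount balance; not kernel code.
 * Assumes 2MB huge pages and 4k base pages (512 chunks per huge page).
 */
#include <assert.h>
#include <stdio.h>

#define CHUNKS_PER_HPAGE 512

int main(void)
{
        int extra_refs = 0;

        extra_refs++;                   /* find_get_page_flags() lookup reference */

        for (int i = 0; i < CHUNKS_PER_HPAGE; i++)
                extra_refs++;           /* get_page(hpage) for each 4k chunk added */

        extra_refs--;                   /* put_page() once the huge page is consumed */

        for (int i = 0; i < CHUNKS_PER_HPAGE; i++)
                extra_refs--;           /* release_udmabuf(): put_page() per pages[] slot */

        assert(extra_refs == 0);
        printf("references taken and released balance out: %d\n", extra_refs);
        return 0;
}

So the head page stays pinned for the lifetime of the udmabuf and all of the
references taken at creation time are returned when the buffer is released.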
> > +                                ubuf->pages[pgbuf] = hpage;
>
> Ah!, answering my own question. Since the 'head page' is added to the
> array, the ref count of the head page will be decremented in
> release_udmabuf.

Yes, that's right.

> > +                                ubuf->offsets[pgbuf++] = chunkoff << PAGE_SHIFT;
> > +                                if (++chunkoff == maxchunks) {
> > +                                        put_page(hpage);
> > +                                        hpage = NULL;
> > +                                        chunkoff = 0;
> > +                                        hpoff++;
> > +                                }
> > +                        } else {
> > +                                page = shmem_read_mapping_page(mapping, pgoff + pgidx);
>
> It may not matter to your users, but the semantics for hugetlb and shmem
> pages are different. hugetlb requires the pages to exist in the page cache
> while shmem will create/add pages to the cache if necessary.

Ok; got it.
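As an aside, the hpoff/chunkoff/maxchunks bookkeeping in the hugetlb branch
above is plain shift-and-mask arithmetic. A minimal userspace sketch of the
same formulas, assuming 2MB huge pages and a made-up byte offset (the macros
and the example value are mine, not kernel code):

/*
 * Userspace sketch of the hugetlb offset bookkeeping; not kernel code.
 * Assumes a 2MB huge page size and an arbitrary example offset.
 */
#include <stdio.h>

#define PAGE_SHIFT   12UL
#define HPAGE_SHIFT  21UL                       /* 2MB huge pages (assumed) */
#define HPAGE_SIZE   (1UL << HPAGE_SHIFT)
#define HPAGE_MASK   (~(HPAGE_SIZE - 1))

int main(void)
{
        unsigned long offset = (5UL << HPAGE_SHIFT) + (3UL << PAGE_SHIFT);

        unsigned long hpoff = offset >> HPAGE_SHIFT;                   /* huge page index */
        unsigned long chunkoff = (offset & ~HPAGE_MASK) >> PAGE_SHIFT; /* 4k chunk within it */
        unsigned long maxchunks = HPAGE_SIZE >> PAGE_SHIFT;            /* chunks per huge page */

        printf("huge page %lu, starting chunk %lu of %lu\n",
               hpoff, chunkoff, maxchunks);
        /* prints: huge page 5, starting chunk 3 of 512 */
        return 0;
}

With 2MB huge pages, a single hugetlb page therefore backs 512 consecutive
scatterlist entries.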
> > +                                if (IS_ERR(page)) {
> > +                                        ret = PTR_ERR(page);
> > +                                        goto err;
> > +                                }
> > +                                ubuf->pages[pgbuf++] = page;
> >                          }
> > -                        ubuf->pages[pgbuf++] = page;
> >                  }
> >                  fput(memfd);
> >                  memfd = NULL;
> > +                if (hpage) {
> > +                        put_page(hpage);
> > +                        hpage = NULL;
> > +                }
> >          }
> >
> >          exp_info.ops = &udmabuf_ops;
> > @@ -287,6 +344,7 @@ static long udmabuf_create(struct miscdevice *device,
> >                  put_page(ubuf->pages[--pgbuf]);
> >          if (memfd)
> >                  fput(memfd);
> > +        kfree(ubuf->offsets);
> >          kfree(ubuf->pages);
> >          kfree(ubuf);
> >          return ret;
> > --
> > 2.39.2
> >
>
> Nothing else stands out,
>
> Acked-by: Mike Kravetz <mike.kravetz@xxxxxxxxxx>

Thank you for the quick review!

> I see there is an RFC for the coherency issue with hole punch.

Yes.

Thanks,
Vivek

> --
> Mike Kravetz