On Tue, 08 Nov 2016 10:20:14 +0800
Jike Song <jike.song@xxxxxxxxx> wrote:

> On 11/08/2016 07:16 AM, Alex Williamson wrote:
> > On Sat, 5 Nov 2016 02:40:44 +0530
> > Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:
> >
> >> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
> >> Mediated device only uses IOMMU APIs, the underlying hardware can be
> >> managed by an IOMMU domain.
> >>
> >> Aim of this change is:
> >> - To use most of the code of TYPE1 IOMMU driver for mediated devices
> >> - To support direct assigned device and mediated device in single module
> >>
> >> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
> >> backend module. More details:
> >> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
> >>   that is, of the process who mapped that iova range.
> >> - Added pfn_list tracking logic to address space structure. All pages
> >>   pinned through this interface are trached in its address space.
> >                                          ^ k
> > ------------------------------------------|
> >
> >> - Pinned pages list is used to verify unpinning request and to unpin
> >>   remaining pages while detaching the group for that device.
> >> - Page accounting is updated to account in its address space where the
> >>   pages are pinned/unpinned.
> >> - Accouting for mdev device is only done if there is no iommu capable
> >>   domain in the container. When there is a direct device assigned to the
> >>   container and that domain is iommu capable, all pages are already pinned
> >>   during DMA_MAP.
> >> - Page accouting is updated on hot plug and unplug mdev device and pass
> >>   through device.
> >>
> >> Tested by assigning below combinations of devices to a single VM:
> >> - GPU pass through only
> >> - vGPU device only
> >> - One GPU pass through and one vGPU device
> >> - Linux VM hot plug and unplug vGPU device while GPU pass through device
> >>   exist
> >> - Linux VM hot plug and unplug GPU pass through device while vGPU device
> >>   exist
> >>
> >> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
> >> Signed-off-by: Neo Jia <cjia@xxxxxxxxxx>
> >> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
> >> ---
> >>  drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
> >>  1 file changed, 500 insertions(+), 38 deletions(-)
> >>
> >> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> >> index 8d64528dcc22..e511073446a0 100644
> >> --- a/drivers/vfio/vfio_iommu_type1.c
> >> +++ b/drivers/vfio/vfio_iommu_type1.c
> >> @@ -36,6 +36,7 @@
> >>  #include <linux/uaccess.h>
> >>  #include <linux/vfio.h>
> >>  #include <linux/workqueue.h>
> >> +#include <linux/mdev.h>
> >>
> >>  #define DRIVER_VERSION  "0.2"
> >>  #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@xxxxxxxxxx>"
> >> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
> >>  struct vfio_iommu {
> >>         struct list_head domain_list;
> >>         struct list_head addr_space_list;
> >> +       struct vfio_domain *external_domain; /* domain for external user */
> >>         struct mutex lock;
> >>         struct rb_root dma_list;
> >>         bool v2;
> >> @@ -67,6 +69,9 @@ struct vfio_addr_space {
> >>         struct mm_struct *mm;
> >>         struct list_head next;
> >>         atomic_t ref_count;
> >> +       /* external user pinned pfns */
> >> +       struct rb_root pfn_list;       /* pinned Host pfn list */
> >> +       struct mutex pfn_list_lock;    /* mutex for pfn_list */
> >>  };
> >>
> >>  struct vfio_domain {
> >> @@ -83,6 +88,7 @@ struct vfio_dma {
> >>         unsigned long vaddr;           /* Process virtual addr */
> >>         size_t size;                   /* Map size (bytes) */
> >>         int prot;                      /* IOMMU_READ/WRITE */
> >> +       bool iommu_mapped;
> >>         struct vfio_addr_space *addr_space;
> >>         struct task_struct *task;
> >>         bool mlock_cap;
> >> @@ -94,6 +100,19 @@ struct vfio_group {
> >>  };
> >>
> >>  /*
> >> + * Guest RAM pinning working set or DMA target
> >> + */
> >> +struct vfio_pfn {
> >> +       struct rb_node node;
> >> +       unsigned long pfn;             /* Host pfn */
> >> +       int prot;
> >> +       atomic_t ref_count;
> >> +};
> >> +
> >> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> >> +       (!list_empty(&iommu->domain_list))
> >> +
> >> +/*
> >>   * This code handles mapping and unmapping of user data buffers
> >>   * into DMA'ble space using the IOMMU
> >>   */
> >> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
> >>         return NULL;
> >>  }
> >>
> >> +/*
> >> + * Helper Functions for host pfn list
> >> + */
> >> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
> >> +                                     unsigned long pfn)
> >> +{
> >> +       struct vfio_pfn *vpfn;
> >> +       struct rb_node *node = addr_space->pfn_list.rb_node;
> >> +
> >> +       while (node) {
> >> +               vpfn = rb_entry(node, struct vfio_pfn, node);
> >> +
> >> +               if (pfn < vpfn->pfn)
> >> +                       node = node->rb_left;
> >> +               else if (pfn > vpfn->pfn)
> >> +                       node = node->rb_right;
> >> +               else
> >> +                       return vpfn;
> >> +       }
> >> +
> >> +       return NULL;
> >> +}
> >> +
> >> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
> >> +                         struct vfio_pfn *new)
> >> +{
> >> +       struct rb_node **link, *parent = NULL;
> >> +       struct vfio_pfn *vpfn;
> >> +
> >> +       link = &addr_space->pfn_list.rb_node;
> >> +       while (*link) {
> >> +               parent = *link;
> >> +               vpfn = rb_entry(parent, struct vfio_pfn, node);
> >> +
> >> +               if (new->pfn < vpfn->pfn)
> >> +                       link = &(*link)->rb_left;
> >> +               else
> >> +                       link = &(*link)->rb_right;
> >> +       }
> >> +
> >> +       rb_link_node(&new->node, parent, link);
> >> +       rb_insert_color(&new->node, &addr_space->pfn_list);
> >> +}
> >> +
> >> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
> >> +                           struct vfio_pfn *old)
> >> +{
> >> +       rb_erase(&old->node, &addr_space->pfn_list);
> >> +}
> >> +
> >> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
> >> +                               unsigned long pfn, int prot)
> >> +{
> >> +       struct vfio_pfn *vpfn;
> >> +
> >> +       vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
> >> +       if (!vpfn)
> >> +               return -ENOMEM;
> >> +
> >> +       vpfn->pfn = pfn;
> >> +       vpfn->prot = prot;
> >> +       atomic_set(&vpfn->ref_count, 1);
> >> +       vfio_link_pfn(addr_space, vpfn);
> >> +       return 0;
> >> +}
> >> +
> >> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
> >> +                                     struct vfio_pfn *vpfn)
> >> +{
> >> +       vfio_unlink_pfn(addr_space, vpfn);
> >> +       kfree(vpfn);
> >> +}
> >> +
> >> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
> >> +                           unsigned long pfn)
> >> +{
> >> +       struct vfio_pfn *p;
> >> +       int ret = 1;
> >> +
> >> +       mutex_lock(&addr_space->pfn_list_lock);
> >> +       p = vfio_find_pfn(addr_space, pfn);
> >> +       if (p)
> >> +               ret = 0;
> >> +       mutex_unlock(&addr_space->pfn_list_lock);
> >> +       return ret;
> >> +}
> >> +
> >>  struct vwork {
> >>         struct mm_struct *mm;
> >>         long npage;
> >> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >>         unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >>         bool lock_cap = dma->mlock_cap;
> >>         struct mm_struct *mm = dma->addr_space->mm;
> >> -       long ret, i;
> >> +       long ret, i, lock_acct;
> >>         bool rsvd;
> >>
> >>         ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> >>         if (ret)
> >>                 return ret;
> >>
> >> +       lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
> >> +
> >>         rsvd = is_invalid_reserved_pfn(*pfn_base);
> >>
> >> -       if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> >> +       if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
> >>                 put_pfn(*pfn_base, prot);
> >>                 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> >>                         limit << PAGE_SHIFT);
> >> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >>                         break;
> >>                 }
> >>
> >> +               lock_acct += vfio_pfn_account(dma->addr_space, pfn);
> >> +
> >>                 if (!rsvd && !lock_cap &&
> >> -                   mm->locked_vm + i + 1 > limit) {
> >> +                   mm->locked_vm + lock_acct + 1 > limit) {
> >>                         put_pfn(pfn, prot);
> >>                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> >>                                 __func__, limit << PAGE_SHIFT);
> >> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> >>         }
> >>
> >>         if (!rsvd)
> >> -               vfio_lock_acct(mm, i);
> >> +               vfio_lock_acct(mm, lock_acct);
> >>
> >>         return i;
> >>  }
> >> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
> >>         return unlocked;
> >>  }
> >>
> >> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
> >> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> >> +                                   int prot, unsigned long *pfn_base,
> >> +                                   bool do_accounting)
> >> +{
> >> +       struct task_struct *task = dma->task;
> >> +       unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >> +       bool lock_cap = dma->mlock_cap;
> >> +       struct mm_struct *mm = dma->addr_space->mm;
> >> +       int ret;
> >> +       bool rsvd;
> >> +
> >> +       ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
> >> +       if (ret)
> >> +               return ret;
> >> +
> >> +       rsvd = is_invalid_reserved_pfn(*pfn_base);
> >> +
> >> +       if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
> >> +               put_pfn(*pfn_base, prot);
> >> +               pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> >> +                       __func__, task->comm, task_pid_nr(task),
> >> +                       limit << PAGE_SHIFT);
> >> +               return -ENOMEM;
> >> +       }
> >> +
> >> +       if (!rsvd && do_accounting)
> >> +               vfio_lock_acct(mm, 1);
> >> +
> >> +       return 1;
> >> +}
> >> +
> >> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
> >> +                                      unsigned long pfn, int prot,
> >> +                                      bool do_accounting)
> >> +{
> >> +       put_pfn(pfn, prot);
> >> +
> >> +       if (do_accounting)
> >> +               vfio_lock_acct(addr_space->mm, -1);
> >
> > Can't we batch this like we do elsewhere? Intel folks, AIUI you intend
> > to pin all VM memory through this side channel, have you tested the
> > scalability and performance of this with larger VMs? Our vfio_pfn
> > data structure alone is 40 bytes per pinned page, which means for
> > each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> > Additionally, unmapping each 1GB of VM memory will result in 256k
> > separate vfio_lock_acct() callbacks. I'm concerned that we're not
> > being efficient enough in either space or time.
>
> Hi Alex,
>
> Sorry for being confusing, Intel vGPU actually doesn't necessarily need
> to pin all guest memory. A vGPU has its page table (GTT), whose access
> is trapped. Whenever guest driver wants to specify a page for DMA, it
> writes the GTT entry - thereby we could know the event and pin that
> page only.
>
> Performance data will be shared once available. Thanks :)

Ok, so maybe if none of the initial users are mapping full VM memory then
scalability can be saved for future optimization.
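
For illustration of the batching discussed above, here is a minimal sketch; it is not part of Kirti's patch or this thread. The function name and the pfn-array interface are hypothetical, while vfio_find_pfn(), vfio_remove_from_pfn_list(), put_pfn() and vfio_lock_acct() are the helpers used by the patch. The point is simply that the ref-count drop, rb-tree removal and pinning accounting can be folded into one pass with a single locked_vm adjustment per batch:

static long vfio_unpin_pages_external_batched(struct vfio_addr_space *addr_space,
                                              unsigned long *pfns, long npage,
                                              bool do_accounting)
{
        long i, unlocked = 0;

        mutex_lock(&addr_space->pfn_list_lock);
        for (i = 0; i < npage; i++) {
                /* vfio_find_pfn() walks the per-mm rb-tree of pinned pfns */
                struct vfio_pfn *vpfn = vfio_find_pfn(addr_space, pfns[i]);

                if (!vpfn)
                        continue;

                /* Drop one reference; unpin and free on the last one */
                if (atomic_dec_and_test(&vpfn->ref_count)) {
                        put_pfn(vpfn->pfn, vpfn->prot);
                        vfio_remove_from_pfn_list(addr_space, vpfn);
                        unlocked++;
                }
        }
        mutex_unlock(&addr_space->pfn_list_lock);

        /* One locked_vm update for the whole batch instead of one per page */
        if (do_accounting && unlocked)
                vfio_lock_acct(addr_space->mm, -unlocked);

        return unlocked;
}

With something along these lines, unmapping 1GB of pinned guest memory would issue a single vfio_lock_acct() update rather than ~256k of them.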
Thanks,
Alex