On 11/08/2016 07:16 AM, Alex Williamson wrote:
> On Sat, 5 Nov 2016 02:40:44 +0530
> Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:
> 
>> VFIO IOMMU drivers are designed for the devices which are IOMMU capable.
>> Mediated device only uses IOMMU APIs, the underlying hardware can be
>> managed by an IOMMU domain.
>>
>> Aim of this change is:
>> - To use most of the code of TYPE1 IOMMU driver for mediated devices
>> - To support direct assigned device and mediated device in single module
>>
>> This change adds pin and unpin support for mediated device to TYPE1 IOMMU
>> backend module. More details:
>> - vfio_pin_pages() callback here uses task and address space of vfio_dma,
>>   that is, of the process who mapped that iova range.
>> - Added pfn_list tracking logic to address space structure. All pages
>>   pinned through this interface are trached in its address space.
>                                           ^ k
> ------------------------------------------|
> 
>> - Pinned pages list is used to verify unpinning request and to unpin
>>   remaining pages while detaching the group for that device.
>> - Page accounting is updated to account in its address space where the
>>   pages are pinned/unpinned.
>> - Accouting for mdev device is only done if there is no iommu capable
>>   domain in the container. When there is a direct device assigned to the
>>   container and that domain is iommu capable, all pages are already pinned
>>   during DMA_MAP.
>> - Page accouting is updated on hot plug and unplug mdev device and pass
>>   through device.
>>
>> Tested by assigning below combinations of devices to a single VM:
>> - GPU pass through only
>> - vGPU device only
>> - One GPU pass through and one vGPU device
>> - Linux VM hot plug and unplug vGPU device while GPU pass through device
>>   exist
>> - Linux VM hot plug and unplug GPU pass through device while vGPU device
>>   exist
>>
>> Signed-off-by: Kirti Wankhede <kwankhede@xxxxxxxxxx>
>> Signed-off-by: Neo Jia <cjia@xxxxxxxxxx>
>> Change-Id: I295d6f0f2e0579b8d9882bfd8fd5a4194b97bd9a
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 538 +++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 500 insertions(+), 38 deletions(-)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>> index 8d64528dcc22..e511073446a0 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -36,6 +36,7 @@
>>  #include <linux/uaccess.h>
>>  #include <linux/vfio.h>
>>  #include <linux/workqueue.h>
>> +#include <linux/mdev.h>
>>
>>  #define DRIVER_VERSION "0.2"
>>  #define DRIVER_AUTHOR "Alex Williamson <alex.williamson@xxxxxxxxxx>"
>> @@ -56,6 +57,7 @@ MODULE_PARM_DESC(disable_hugepages,
>>  struct vfio_iommu {
>>          struct list_head domain_list;
>>          struct list_head addr_space_list;
>> +        struct vfio_domain *external_domain; /* domain for external user */
>>          struct mutex lock;
>>          struct rb_root dma_list;
>>          bool v2;
>> @@ -67,6 +69,9 @@ struct vfio_addr_space {
>>          struct mm_struct *mm;
>>          struct list_head next;
>>          atomic_t ref_count;
>> +        /* external user pinned pfns */
>> +        struct rb_root pfn_list;      /* pinned Host pfn list */
>> +        struct mutex pfn_list_lock;   /* mutex for pfn_list */
>>  };
>>
>>  struct vfio_domain {
>> @@ -83,6 +88,7 @@ struct vfio_dma {
>>          unsigned long vaddr;          /* Process virtual addr */
>>          size_t size;                  /* Map size (bytes) */
>>          int prot;                     /* IOMMU_READ/WRITE */
>> +        bool iommu_mapped;
>>          struct vfio_addr_space *addr_space;
>>          struct task_struct *task;
>>          bool mlock_cap;
>> @@ -94,6 +100,19 @@ struct vfio_group {
>>  };
>>
>>  /*
>> + * Guest RAM pinning working set or DMA target
>> + */
>> +struct vfio_pfn {
>> +        struct rb_node node;
>> +        unsigned long pfn;            /* Host pfn */
>> +        int prot;
>> +        atomic_t ref_count;
>> +};
>> +
>> +#define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
>> +        (!list_empty(&iommu->domain_list))
>> +
>> +/*
>>   * This code handles mapping and unmapping of user data buffers
>>   * into DMA'ble space using the IOMMU
>>   */
>> @@ -153,6 +172,93 @@ static struct vfio_addr_space *vfio_find_addr_space(struct vfio_iommu *iommu,
>>          return NULL;
>>  }
>>
>> +/*
>> + * Helper Functions for host pfn list
>> + */
>> +static struct vfio_pfn *vfio_find_pfn(struct vfio_addr_space *addr_space,
>> +                                      unsigned long pfn)
>> +{
>> +        struct vfio_pfn *vpfn;
>> +        struct rb_node *node = addr_space->pfn_list.rb_node;
>> +
>> +        while (node) {
>> +                vpfn = rb_entry(node, struct vfio_pfn, node);
>> +
>> +                if (pfn < vpfn->pfn)
>> +                        node = node->rb_left;
>> +                else if (pfn > vpfn->pfn)
>> +                        node = node->rb_right;
>> +                else
>> +                        return vpfn;
>> +        }
>> +
>> +        return NULL;
>> +}
>> +
>> +static void vfio_link_pfn(struct vfio_addr_space *addr_space,
>> +                          struct vfio_pfn *new)
>> +{
>> +        struct rb_node **link, *parent = NULL;
>> +        struct vfio_pfn *vpfn;
>> +
>> +        link = &addr_space->pfn_list.rb_node;
>> +        while (*link) {
>> +                parent = *link;
>> +                vpfn = rb_entry(parent, struct vfio_pfn, node);
>> +
>> +                if (new->pfn < vpfn->pfn)
>> +                        link = &(*link)->rb_left;
>> +                else
>> +                        link = &(*link)->rb_right;
>> +        }
>> +
>> +        rb_link_node(&new->node, parent, link);
>> +        rb_insert_color(&new->node, &addr_space->pfn_list);
>> +}
>> +
>> +static void vfio_unlink_pfn(struct vfio_addr_space *addr_space,
>> +                            struct vfio_pfn *old)
>> +{
>> +        rb_erase(&old->node, &addr_space->pfn_list);
>> +}
>> +
>> +static int vfio_add_to_pfn_list(struct vfio_addr_space *addr_space,
>> +                                unsigned long pfn, int prot)
>> +{
>> +        struct vfio_pfn *vpfn;
>> +
>> +        vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
>> +        if (!vpfn)
>> +                return -ENOMEM;
>> +
>> +        vpfn->pfn = pfn;
>> +        vpfn->prot = prot;
>> +        atomic_set(&vpfn->ref_count, 1);
>> +        vfio_link_pfn(addr_space, vpfn);
>> +        return 0;
>> +}
>> +
>> +static void vfio_remove_from_pfn_list(struct vfio_addr_space *addr_space,
>> +                                      struct vfio_pfn *vpfn)
>> +{
>> +        vfio_unlink_pfn(addr_space, vpfn);
>> +        kfree(vpfn);
>> +}
>> +
>> +static int vfio_pfn_account(struct vfio_addr_space *addr_space,
>> +                            unsigned long pfn)
>> +{
>> +        struct vfio_pfn *p;
>> +        int ret = 1;
>> +
>> +        mutex_lock(&addr_space->pfn_list_lock);
>> +        p = vfio_find_pfn(addr_space, pfn);
>> +        if (p)
>> +                ret = 0;
>> +        mutex_unlock(&addr_space->pfn_list_lock);
>> +        return ret;
>> +}
>> +
>>  struct vwork {
>>          struct mm_struct *mm;
>>          long npage;
>> @@ -304,16 +410,18 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>          unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>>          bool lock_cap = dma->mlock_cap;
>>          struct mm_struct *mm = dma->addr_space->mm;
>> -        long ret, i;
>> +        long ret, i, lock_acct;
>>          bool rsvd;
>>
>>          ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>>          if (ret)
>>                  return ret;
>>
>> +        lock_acct = vfio_pfn_account(dma->addr_space, *pfn_base);
>> +
>>          rsvd = is_invalid_reserved_pfn(*pfn_base);
>>
>> -        if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +        if (!rsvd && !lock_cap && mm->locked_vm + lock_acct > limit) {
>>                  put_pfn(*pfn_base, prot);
>>                  pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
>>                          limit << PAGE_SHIFT);
>> @@ -340,8 +448,10 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>                          break;
>>                  }
>>
>> +                lock_acct += vfio_pfn_account(dma->addr_space, pfn);
>> +
>>                  if (!rsvd && !lock_cap &&
>> -                    mm->locked_vm + i + 1 > limit) {
>> +                    mm->locked_vm + lock_acct + 1 > limit) {
>>                          put_pfn(pfn, prot);
>>                          pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
>>                                  __func__, limit << PAGE_SHIFT);
>> @@ -350,7 +460,7 @@ static long __vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
>>          }
>>
>>          if (!rsvd)
>> -                vfio_lock_acct(mm, i);
>> +                vfio_lock_acct(mm, lock_acct);
>>
>>          return i;
>>  }
>> @@ -370,14 +480,214 @@ static long __vfio_unpin_pages_remote(struct vfio_dma *dma, unsigned long pfn,
>>          return unlocked;
>>  }
>>
>> -static void vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma)
>> +static int __vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
>> +                                    int prot, unsigned long *pfn_base,
>> +                                    bool do_accounting)
>> +{
>> +        struct task_struct *task = dma->task;
>> +        unsigned long limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
>> +        bool lock_cap = dma->mlock_cap;
>> +        struct mm_struct *mm = dma->addr_space->mm;
>> +        int ret;
>> +        bool rsvd;
>> +
>> +        ret = vaddr_get_pfn(mm, vaddr, prot, pfn_base);
>> +        if (ret)
>> +                return ret;
>> +
>> +        rsvd = is_invalid_reserved_pfn(*pfn_base);
>> +
>> +        if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) {
>> +                put_pfn(*pfn_base, prot);
>> +                pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
>> +                        __func__, task->comm, task_pid_nr(task),
>> +                        limit << PAGE_SHIFT);
>> +                return -ENOMEM;
>> +        }
>> +
>> +        if (!rsvd && do_accounting)
>> +                vfio_lock_acct(mm, 1);
>> +
>> +        return 1;
>> +}
>> +
>> +static void __vfio_unpin_page_external(struct vfio_addr_space *addr_space,
>> +                                       unsigned long pfn, int prot,
>> +                                       bool do_accounting)
>> +{
>> +        put_pfn(pfn, prot);
>> +
>> +        if (do_accounting)
>> +                vfio_lock_acct(addr_space->mm, -1);
> 
> Can't we batch this like we do elsewhere? Intel folks, AIUI you intend
> to pin all VM memory through this side channel, have you tested the
> scalability and performance of this with larger VMs? Our vfio_pfn
> data structure alone is 40 bytes per pinned page, which means for
> each 1GB of VM memory, we have 10MBs worth of struct vfio_pfn!
> Additionally, unmapping each 1GB of VM memory will result in 256k
> separate vfio_lock_acct() callbacks. I'm concerned that we're not
> being efficient enough in either space or time.

Hi Alex,

Sorry for the confusion: Intel vGPU actually doesn't necessarily need to
pin all guest memory. A vGPU has its own page table (the GTT), and
accesses to it are trapped. Whenever the guest driver wants to use a page
for DMA, it writes the corresponding GTT entry, so we see that event and
pin only that page.

Performance data will be shared once available.

Thanks :)

--
Thanks,
Jike
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
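
[A quick back-of-the-envelope check of the space/time overhead Alex raises
above. This is only an illustrative sketch, not part of the patch; it
assumes 4 KiB pages and takes the quoted 40-byte struct vfio_pfn size at
face value.]

/*
 * Illustrative only -- not part of the patch above.  Reproduces the
 * arithmetic from the review comment, assuming 4 KiB pages and a
 * 40-byte struct vfio_pfn per pinned page.
 */
#include <stdio.h>

int main(void)
{
        unsigned long vm_mem_bytes = 1UL << 30;   /* 1 GiB of guest memory */
        unsigned long page_size    = 4096;        /* assumed 4 KiB pages */
        unsigned long vfio_pfn_sz  = 40;          /* bytes per struct vfio_pfn */
        unsigned long pages        = vm_mem_bytes / page_size;

        /* 262144 pages, i.e. 256k vfio_lock_acct() calls on unmap of 1 GiB */
        printf("pinned pages per GiB: %lu\n", pages);

        /* 262144 * 40 bytes = 10 MiB of struct vfio_pfn per pinned GiB */
        printf("vfio_pfn overhead per GiB: %lu MiB\n",
               pages * vfio_pfn_sz >> 20);

        return 0;
}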