On Wed, Mar 08, 2017 at 02:00:11PM +1100, Alexey Kardashevskiy wrote: > This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT > and H_STUFF_TCE requests targeted an IOMMU TCE table used for VFIO > without passing them to user space which saves time on switching > to user space and back. > > This adds H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE handlers to KVM. > KVM tries to handle a TCE request in the real mode, if failed > it passes the request to the virtual mode to complete the operation. > If it a virtual mode handler fails, the request is passed to > the user space; this is not expected to happen though. > > To avoid dealing with page use counters (which is tricky in real mode), > this only accelerates SPAPR TCE IOMMU v2 clients which are required > to pre-register the userspace memory. The very first TCE request will > be handled in the VFIO SPAPR TCE driver anyway as the userspace view > of the TCE table (iommu_table::it_userspace) is not allocated till > the very first mapping happens and we cannot call vmalloc in real mode. > > If we fail to update a hardware IOMMU table unexpected reason, we just > clear it and move on as there is nothing really we can do about it - > for example, if we hot plug a VFIO device to a guest, existing TCE tables > will be mirrored automatically to the hardware and there is no interface > to report to the guest about possible failures. > > This adds new attribute - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE - to > the VFIO KVM device. It takes a VFIO group fd and SPAPR TCE table fd > and associates a physical IOMMU table with the SPAPR TCE table (which > is a guest view of the hardware IOMMU table). The iommu_table object > is cached and referenced so we do not have to look up for it in real mode. > > This does not implement the UNSET counterpart as there is no use for it - > once the acceleration is enabled, the existing userspace won't > disable it unless a VFIO container is destroyed; this adds necessary > cleanup to the KVM_DEV_VFIO_GROUP_DEL handler. > > As this creates a descriptor per IOMMU table-LIOBN couple (called > kvmppc_spapr_tce_iommu_table), it is possible to have several > descriptors with the same iommu_table (hardware IOMMU table) attached > to the same LIOBN; we do not remove duplicates though as > iommu_table_ops::exchange not just update a TCE entry (which is > shared among IOMMU groups) but also invalidates the TCE cache > (one per IOMMU group). > > This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to the user > space. > > This finally makes use of vfio_external_user_iommu_id() which was > introduced quite some time ago and was considered for removal. > > Tests show that this patch increases transmission speed from 220MB/s > to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). > > Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> There are still a few oddities in the error paths. [snip] > +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, > + struct iommu_table *tbl, unsigned long entry) > +{ > + enum dma_data_direction dir = DMA_NONE; > + unsigned long hpa = 0; > + long ret; > + > + if (iommu_tce_xchg(tbl, entry, &hpa, &dir)) > + return H_HARDWARE; IIR previous discussions properly, an xchg() failure should be a WARN_ON(). [snip] > +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, > + struct iommu_table *tbl, unsigned long entry) > +{ > + struct mm_iommu_table_group_mem_t *mem = NULL; > + const unsigned long pgsize = 1ULL << tbl->it_page_shift; > + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); > + > + if (WARN_ON_ONCE_RM(!pua)) > + return H_HARDWARE; > + > + pua = (void *) vmalloc_to_phys(pua); > + if (!pua) > + return H_TOO_HARD; Here a vmalloc translation failure is treated as an H_TOO_HARD.. > + > + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); > + if (!mem) > + return H_TOO_HARD; > + > + mm_iommu_mapped_dec(mem); > + > + *pua = 0; > + > + return H_SUCCESS; > +} > + > +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, > + struct iommu_table *tbl, unsigned long entry) > +{ > + enum dma_data_direction dir = DMA_NONE; > + unsigned long hpa = 0; > + long ret; > + > + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) > + return H_HARDWARE; Another xchg failure where I'd expect a WARN. > + > + if (dir == DMA_NONE) > + return H_SUCCESS; > + > + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); > + if (ret) > + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); > + > + return ret; > +} > + > +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, > + unsigned long entry, unsigned long ua, > + enum dma_data_direction dir) > +{ > + long ret; > + unsigned long hpa = 0; > + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); > + struct mm_iommu_table_group_mem_t *mem; > + > + if (!pua) > + /* it_userspace allocation might be delayed */ > + return H_TOO_HARD; > + > + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); > + if (!mem) > + return H_TOO_HARD; > + > + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) > + return H_HARDWARE; > + > + pua = (void *) vmalloc_to_phys(pua); > + if (WARN_ON_ONCE_RM(!pua)) .. but here a more or less identical vmalloc translation failure is treated as a WARN. Is it TOO_HARD, or WARN? > + return H_HARDWARE; > + > + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) > + return H_CLOSED; > + > + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); > + if (ret) { > + mm_iommu_mapped_dec(mem); > + return H_TOO_HARD; And another xchg failure that I'd expect to be a warn > + } > + > + if (dir != DMA_NONE) > + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); > + > + *pua = ua; > + > + return 0; > +} > + > long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > unsigned long ioba, unsigned long tce) > { > struct kvmppc_spapr_tce_table *stt; > long ret; > + struct kvmppc_spapr_tce_iommu_table *stit; > + unsigned long entry, ua = 0; > + enum dma_data_direction dir; > > /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ > /* liobn, ioba, tce); */ > @@ -182,7 +304,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > if (ret != H_SUCCESS) > return ret; > > - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); > + dir = iommu_tce_direction(tce); > + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, > + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) > + return H_PARAMETER; > + > + entry = ioba >> stt->page_shift; > + > + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { > + if (dir == DMA_NONE) > + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, > + stit->tbl, entry); > + else > + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, > + stit->tbl, entry, ua, dir); > + > + if (ret == H_SUCCESS) > + continue; > + > + if (ret == H_TOO_HARD) > + return ret; > + > + WARN_ON_ONCE_RM(1); > + kvmppc_rm_clear_tce(stit->tbl, entry); > + } > + > + kvmppc_tce_put(stt, entry, tce); > > return H_SUCCESS; > } > @@ -223,6 +370,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > unsigned long tces, entry, ua = 0; > unsigned long *rmap = NULL; > bool prereg = false; > + struct kvmppc_spapr_tce_iommu_table *stit; > > stt = kvmppc_find_table(vcpu->kvm, liobn); > if (!stt) > @@ -293,6 +441,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > if (ret != H_SUCCESS) > goto unlock_exit; > > + ua = 0; > + if (kvmppc_gpa_to_ua(vcpu->kvm, > + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), > + &ua, NULL)) > + return H_PARAMETER; > + > + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { > + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, > + stit->tbl, entry + i, ua, > + iommu_tce_direction(tce)); > + > + if (ret == H_SUCCESS) > + continue; > + > + if (ret == H_TOO_HARD) > + goto unlock_exit; > + > + WARN_ON_ONCE_RM(1); > + kvmppc_rm_clear_tce(stit->tbl, entry); > + } > + > kvmppc_tce_put(stt, entry + i, tce); > } > > @@ -309,6 +478,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, > { > struct kvmppc_spapr_tce_table *stt; > long i, ret; > + struct kvmppc_spapr_tce_iommu_table *stit; > > stt = kvmppc_find_table(vcpu->kvm, liobn); > if (!stt) > @@ -322,6 +492,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, > if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) > return H_PARAMETER; > > + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { > + unsigned long entry = ioba >> stit->tbl->it_page_shift; > + > + for (i = 0; i < npages; ++i) { > + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, > + stit->tbl, entry + i); > + > + if (ret == H_SUCCESS) > + continue; > + > + if (ret == H_TOO_HARD) > + return ret; > + > + WARN_ON_ONCE_RM(1); > + kvmppc_rm_clear_tce(stit->tbl, entry); > + } > + } > + > for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) > kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); > > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index 95c91a9de351..62bdd6c48107 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -538,6 +538,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) > #ifdef CONFIG_PPC_BOOK3S_64 > case KVM_CAP_SPAPR_TCE: > case KVM_CAP_SPAPR_TCE_64: > + /* fallthrough */ > + case KVM_CAP_SPAPR_TCE_VFIO: > case KVM_CAP_PPC_RTAS: > case KVM_CAP_PPC_FIXUP_HCALL: > case KVM_CAP_PPC_ENABLE_HCALL: > diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c > index d32f239eb471..2b7dc22265fe 100644 > --- a/virt/kvm/vfio.c > +++ b/virt/kvm/vfio.c > @@ -20,6 +20,10 @@ > #include <linux/vfio.h> > #include "vfio.h" > > +#ifdef CONFIG_SPAPR_TCE_IOMMU > +#include <asm/kvm_ppc.h> > +#endif > + > struct kvm_vfio_group { > struct list_head node; > struct vfio_group *vfio_group; > @@ -211,6 +215,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) > > mutex_unlock(&kv->lock); > > +#ifdef CONFIG_SPAPR_TCE_IOMMU > + kvm_spapr_tce_release_iommu_group(dev->kvm, vfio_group); > +#endif > kvm_vfio_group_set_kvm(vfio_group, NULL); > > kvm_vfio_group_put_external_user(vfio_group); > @@ -218,6 +225,53 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) > kvm_vfio_update_coherency(dev); > > return ret; > + > +#ifdef CONFIG_SPAPR_TCE_IOMMU > + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { > + struct kvm_vfio_spapr_tce param; > + unsigned long minsz; > + struct kvm_vfio *kv = dev->private; > + struct vfio_group *vfio_group; > + struct kvm_vfio_group *kvg; > + struct fd f; > + > + minsz = offsetofend(struct kvm_vfio_spapr_tce, tablefd); > + > + if (copy_from_user(¶m, (void __user *)arg, minsz)) > + return -EFAULT; > + > + if (param.argsz < minsz || param.flags) > + return -EINVAL; > + > + f = fdget(param.groupfd); > + if (!f.file) > + return -EBADF; > + > + vfio_group = kvm_vfio_group_get_external_user(f.file); > + fdput(f); > + > + if (IS_ERR(vfio_group)) > + return PTR_ERR(vfio_group); > + > + ret = -ENOENT; > + > + mutex_lock(&kv->lock); > + > + list_for_each_entry(kvg, &kv->group_list, node) { > + if (kvg->vfio_group != vfio_group) > + continue; > + > + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, > + param.tablefd, vfio_group); > + > + break; > + } > + > + mutex_unlock(&kv->lock); > + > + return ret; > + } > +#endif /* CONFIG_SPAPR_TCE_IOMMU */ > } > > return -ENXIO; > @@ -242,6 +296,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, > switch (attr->attr) { > case KVM_DEV_VFIO_GROUP_ADD: > case KVM_DEV_VFIO_GROUP_DEL: > +#ifdef CONFIG_SPAPR_TCE_IOMMU > + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: > +#endif > return 0; > } > > @@ -257,6 +314,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev) > struct kvm_vfio_group *kvg, *tmp; > > list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { > +#ifdef CONFIG_SPAPR_TCE_IOMMU > + kvm_spapr_tce_release_iommu_group(dev->kvm, kvg->vfio_group); > +#endif > kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); > kvm_vfio_group_put_external_user(kvg->vfio_group); > list_del(&kvg->node); -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Attachment:
signature.asc
Description: PGP signature