On 09/03/17 17:47, David Gibson wrote: > On Wed, Mar 08, 2017 at 02:00:11PM +1100, Alexey Kardashevskiy wrote: >> This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT >> and H_STUFF_TCE requests targeted an IOMMU TCE table used for VFIO >> without passing them to user space which saves time on switching >> to user space and back. >> >> This adds H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE handlers to KVM. >> KVM tries to handle a TCE request in the real mode, if failed >> it passes the request to the virtual mode to complete the operation. >> If it a virtual mode handler fails, the request is passed to >> the user space; this is not expected to happen though. >> >> To avoid dealing with page use counters (which is tricky in real mode), >> this only accelerates SPAPR TCE IOMMU v2 clients which are required >> to pre-register the userspace memory. The very first TCE request will >> be handled in the VFIO SPAPR TCE driver anyway as the userspace view >> of the TCE table (iommu_table::it_userspace) is not allocated till >> the very first mapping happens and we cannot call vmalloc in real mode. >> >> If we fail to update a hardware IOMMU table unexpected reason, we just >> clear it and move on as there is nothing really we can do about it - >> for example, if we hot plug a VFIO device to a guest, existing TCE tables >> will be mirrored automatically to the hardware and there is no interface >> to report to the guest about possible failures. >> >> This adds new attribute - KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE - to >> the VFIO KVM device. It takes a VFIO group fd and SPAPR TCE table fd >> and associates a physical IOMMU table with the SPAPR TCE table (which >> is a guest view of the hardware IOMMU table). The iommu_table object >> is cached and referenced so we do not have to look up for it in real mode. >> >> This does not implement the UNSET counterpart as there is no use for it - >> once the acceleration is enabled, the existing userspace won't >> disable it unless a VFIO container is destroyed; this adds necessary >> cleanup to the KVM_DEV_VFIO_GROUP_DEL handler. >> >> As this creates a descriptor per IOMMU table-LIOBN couple (called >> kvmppc_spapr_tce_iommu_table), it is possible to have several >> descriptors with the same iommu_table (hardware IOMMU table) attached >> to the same LIOBN; we do not remove duplicates though as >> iommu_table_ops::exchange not just update a TCE entry (which is >> shared among IOMMU groups) but also invalidates the TCE cache >> (one per IOMMU group). >> >> This advertises the new KVM_CAP_SPAPR_TCE_VFIO capability to the user >> space. >> >> This finally makes use of vfio_external_user_iommu_id() which was >> introduced quite some time ago and was considered for removal. >> >> Tests show that this patch increases transmission speed from 220MB/s >> to 750..1020MB/s on 10Gb network (Chelsea CXGB3 10Gb ethernet card). >> >> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> > > There are still a few oddities in the error paths. > > [snip] >> +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, >> + struct iommu_table *tbl, unsigned long entry) >> +{ >> + enum dma_data_direction dir = DMA_NONE; >> + unsigned long hpa = 0; >> + long ret; >> + >> + if (iommu_tce_xchg(tbl, entry, &hpa, &dir)) >> + return H_HARDWARE; > > IIR previous discussions properly, an xchg() failure should be a > WARN_ON(). Well, it could, I'll add. It just feels sometime that having both WARN_ON and H_HARDWARE is kind of redundant. > [snip] >> +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, >> + struct iommu_table *tbl, unsigned long entry) >> +{ >> + struct mm_iommu_table_group_mem_t *mem = NULL; >> + const unsigned long pgsize = 1ULL << tbl->it_page_shift; >> + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); >> + >> + if (WARN_ON_ONCE_RM(!pua)) >> + return H_HARDWARE; >> + >> + pua = (void *) vmalloc_to_phys(pua); >> + if (!pua) >> + return H_TOO_HARD; > > Here a vmalloc translation failure is treated as an H_TOO_HARD.. Ah, here is an oddity, here should be WARN_ON + H_HARDWARE and the if(!pua) check should not do WARN_ON and return H_TOO_HARD, I did not hit this because the hardware is empty when VFIO starts using it so there is nothing to unmap initial H_STUFF_TCE happens. > >> + >> + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); >> + if (!mem) >> + return H_TOO_HARD; >> + >> + mm_iommu_mapped_dec(mem); >> + >> + *pua = 0; >> + >> + return H_SUCCESS; >> +} >> + >> +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, >> + struct iommu_table *tbl, unsigned long entry) >> +{ >> + enum dma_data_direction dir = DMA_NONE; >> + unsigned long hpa = 0; >> + long ret; >> + >> + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) >> + return H_HARDWARE; > > Another xchg failure where I'd expect a WARN. The realmode's iommu_tce_xchg_rm() does SetPageDirty() if realmode_pfn_to_page() succeeded but realmode_pfn_to_page() may fail if the requested page struct is split between 2 system pages in real addressspace but not in virtual address space where page structs are mapped to 0xfxxx.xxxx.xxxx.xxxx. > >> + >> + if (dir == DMA_NONE) >> + return H_SUCCESS; >> + >> + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); >> + if (ret) >> + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); >> + >> + return ret; >> +} >> + >> +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, >> + unsigned long entry, unsigned long ua, >> + enum dma_data_direction dir) >> +{ >> + long ret; >> + unsigned long hpa = 0; >> + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); >> + struct mm_iommu_table_group_mem_t *mem; >> + >> + if (!pua) >> + /* it_userspace allocation might be delayed */ >> + return H_TOO_HARD; >> + >> + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); >> + if (!mem) >> + return H_TOO_HARD; >> + >> + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) >> + return H_HARDWARE; >> + >> + pua = (void *) vmalloc_to_phys(pua); >> + if (WARN_ON_ONCE_RM(!pua)) > > .. but here a more or less identical vmalloc translation failure is > treated as a WARN. Is it TOO_HARD, or WARN? This one is correct. > >> + return H_HARDWARE; >> + >> + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) >> + return H_CLOSED; >> + >> + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); >> + if (ret) { >> + mm_iommu_mapped_dec(mem); >> + return H_TOO_HARD; > > And another xchg failure that I'd expect to be a warn Here it is the realmode's iommu_tce_xchg again. > >> + } >> + >> + if (dir != DMA_NONE) >> + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); >> + >> + *pua = ua; >> + >> + return 0; >> +} >> + >> long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, >> unsigned long ioba, unsigned long tce) >> { >> struct kvmppc_spapr_tce_table *stt; >> long ret; >> + struct kvmppc_spapr_tce_iommu_table *stit; >> + unsigned long entry, ua = 0; >> + enum dma_data_direction dir; >> >> /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ >> /* liobn, ioba, tce); */ >> @@ -182,7 +304,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, >> if (ret != H_SUCCESS) >> return ret; >> >> - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); >> + dir = iommu_tce_direction(tce); >> + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, >> + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) >> + return H_PARAMETER; >> + >> + entry = ioba >> stt->page_shift; >> + >> + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { >> + if (dir == DMA_NONE) >> + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, >> + stit->tbl, entry); >> + else >> + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, >> + stit->tbl, entry, ua, dir); >> + >> + if (ret == H_SUCCESS) >> + continue; >> + >> + if (ret == H_TOO_HARD) >> + return ret; >> + >> + WARN_ON_ONCE_RM(1); >> + kvmppc_rm_clear_tce(stit->tbl, entry); >> + } >> + >> + kvmppc_tce_put(stt, entry, tce); >> >> return H_SUCCESS; >> } >> @@ -223,6 +370,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, >> unsigned long tces, entry, ua = 0; >> unsigned long *rmap = NULL; >> bool prereg = false; >> + struct kvmppc_spapr_tce_iommu_table *stit; >> >> stt = kvmppc_find_table(vcpu->kvm, liobn); >> if (!stt) >> @@ -293,6 +441,27 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, >> if (ret != H_SUCCESS) >> goto unlock_exit; >> >> + ua = 0; >> + if (kvmppc_gpa_to_ua(vcpu->kvm, >> + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), >> + &ua, NULL)) >> + return H_PARAMETER; >> + >> + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { >> + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, >> + stit->tbl, entry + i, ua, >> + iommu_tce_direction(tce)); >> + >> + if (ret == H_SUCCESS) >> + continue; >> + >> + if (ret == H_TOO_HARD) >> + goto unlock_exit; >> + >> + WARN_ON_ONCE_RM(1); >> + kvmppc_rm_clear_tce(stit->tbl, entry); >> + } >> + >> kvmppc_tce_put(stt, entry + i, tce); >> } >> >> @@ -309,6 +478,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, >> { >> struct kvmppc_spapr_tce_table *stt; >> long i, ret; >> + struct kvmppc_spapr_tce_iommu_table *stit; >> >> stt = kvmppc_find_table(vcpu->kvm, liobn); >> if (!stt) >> @@ -322,6 +492,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, >> if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) >> return H_PARAMETER; >> >> + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { >> + unsigned long entry = ioba >> stit->tbl->it_page_shift; >> + >> + for (i = 0; i < npages; ++i) { >> + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, >> + stit->tbl, entry + i); >> + >> + if (ret == H_SUCCESS) >> + continue; >> + >> + if (ret == H_TOO_HARD) >> + return ret; >> + >> + WARN_ON_ONCE_RM(1); >> + kvmppc_rm_clear_tce(stit->tbl, entry); >> + } >> + } >> + >> for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) >> kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); >> >> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c >> index 95c91a9de351..62bdd6c48107 100644 >> --- a/arch/powerpc/kvm/powerpc.c >> +++ b/arch/powerpc/kvm/powerpc.c >> @@ -538,6 +538,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) >> #ifdef CONFIG_PPC_BOOK3S_64 >> case KVM_CAP_SPAPR_TCE: >> case KVM_CAP_SPAPR_TCE_64: >> + /* fallthrough */ >> + case KVM_CAP_SPAPR_TCE_VFIO: >> case KVM_CAP_PPC_RTAS: >> case KVM_CAP_PPC_FIXUP_HCALL: >> case KVM_CAP_PPC_ENABLE_HCALL: >> diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c >> index d32f239eb471..2b7dc22265fe 100644 >> --- a/virt/kvm/vfio.c >> +++ b/virt/kvm/vfio.c >> @@ -20,6 +20,10 @@ >> #include <linux/vfio.h> >> #include "vfio.h" >> >> +#ifdef CONFIG_SPAPR_TCE_IOMMU >> +#include <asm/kvm_ppc.h> >> +#endif >> + >> struct kvm_vfio_group { >> struct list_head node; >> struct vfio_group *vfio_group; >> @@ -211,6 +215,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) >> >> mutex_unlock(&kv->lock); >> >> +#ifdef CONFIG_SPAPR_TCE_IOMMU >> + kvm_spapr_tce_release_iommu_group(dev->kvm, vfio_group); >> +#endif >> kvm_vfio_group_set_kvm(vfio_group, NULL); >> >> kvm_vfio_group_put_external_user(vfio_group); >> @@ -218,6 +225,53 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) >> kvm_vfio_update_coherency(dev); >> >> return ret; >> + >> +#ifdef CONFIG_SPAPR_TCE_IOMMU >> + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { >> + struct kvm_vfio_spapr_tce param; >> + unsigned long minsz; >> + struct kvm_vfio *kv = dev->private; >> + struct vfio_group *vfio_group; >> + struct kvm_vfio_group *kvg; >> + struct fd f; >> + >> + minsz = offsetofend(struct kvm_vfio_spapr_tce, tablefd); >> + >> + if (copy_from_user(¶m, (void __user *)arg, minsz)) >> + return -EFAULT; >> + >> + if (param.argsz < minsz || param.flags) >> + return -EINVAL; >> + >> + f = fdget(param.groupfd); >> + if (!f.file) >> + return -EBADF; >> + >> + vfio_group = kvm_vfio_group_get_external_user(f.file); >> + fdput(f); >> + >> + if (IS_ERR(vfio_group)) >> + return PTR_ERR(vfio_group); >> + >> + ret = -ENOENT; >> + >> + mutex_lock(&kv->lock); >> + >> + list_for_each_entry(kvg, &kv->group_list, node) { >> + if (kvg->vfio_group != vfio_group) >> + continue; >> + >> + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, >> + param.tablefd, vfio_group); >> + >> + break; >> + } >> + >> + mutex_unlock(&kv->lock); >> + >> + return ret; >> + } >> +#endif /* CONFIG_SPAPR_TCE_IOMMU */ >> } >> >> return -ENXIO; >> @@ -242,6 +296,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, >> switch (attr->attr) { >> case KVM_DEV_VFIO_GROUP_ADD: >> case KVM_DEV_VFIO_GROUP_DEL: >> +#ifdef CONFIG_SPAPR_TCE_IOMMU >> + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: >> +#endif >> return 0; >> } >> >> @@ -257,6 +314,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev) >> struct kvm_vfio_group *kvg, *tmp; >> >> list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { >> +#ifdef CONFIG_SPAPR_TCE_IOMMU >> + kvm_spapr_tce_release_iommu_group(dev->kvm, kvg->vfio_group); >> +#endif >> kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); >> kvm_vfio_group_put_external_user(kvg->vfio_group); >> list_del(&kvg->node); > -- Alexey
Attachment:
signature.asc
Description: OpenPGP digital signature