We already allocate hardware TCE tables in multiple levels and skip intermediate levels when we can; now it is the turn of the KVM TCE tables. Thankfully, these are already allocated in 2 levels. This moves the table's last level allocation from the creating helper to kvmppc_tce_put() and kvm_spapr_tce_fault(). This adds kvmppc_rm_ioba_validate() to additionally test whether the subsequent kvmppc_tce_put() needs a page which has not been allocated; if this is the case, we bail out to virtual mode handlers. Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> --- For NVLink2 passthrough guests with 128TiB DMA windows (when we push GPU RAM above the PCI MMIO window in the guest) and very fragmented system RAM, this patch saves about 16GiB of RAM. --- arch/powerpc/kvm/book3s_64_vio.c | 21 ++++------ arch/powerpc/kvm/book3s_64_vio_hv.c | 62 ++++++++++++++++++++++++++--- 2 files changed, 64 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index f02b049..281b56b 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head) unsigned long i, npages = kvmppc_tce_pages(stt->size); for (i = 0; i < npages; i++) - __free_page(stt->pages[i]); + if (stt->pages[i]) + __free_page(stt->pages[i]); kfree(stt); } @@ -241,6 +242,12 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf) if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) return VM_FAULT_SIGBUS; + if (!stt->pages[vmf->pgoff]) { + stt->pages[vmf->pgoff] = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!stt->pages[vmf->pgoff]) + return VM_FAULT_OOM; + } + page = stt->pages[vmf->pgoff]; get_page(page); vmf->page = page; @@ -296,7 +303,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvmppc_spapr_tce_table *siter; unsigned long npages, size = args->size; int ret = -ENOMEM; - int i; if (!args->size || args->page_shift < 12 || 
args->page_shift > 34 || (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) @@ -320,12 +326,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->kvm = kvm; INIT_LIST_HEAD_RCU(&stt->iommu_tables); - for (i = 0; i < npages; i++) { - stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!stt->pages[i]) - goto fail; - } - mutex_lock(&kvm->lock); /* Check this LIOBN hasn't been previously allocated */ @@ -352,11 +352,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, if (ret >= 0) return ret; - fail: - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); - kfree(stt); fail_acct: kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 2206bc7..fbb920d 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -158,23 +158,69 @@ static u64 *kvmppc_page_address(struct page *page) return (u64 *) page_address(page); } +/* + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so + * in real mode. + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is + * allocated or not required (when clearing a tce entry). + */ +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, + unsigned long ioba, unsigned long npages, bool clearing) +{ + unsigned long i, sttpage, sttpages; + unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages); + + if (ret) + return ret; + /* + * clearing==true says kvmppc_tce_put won't be allocating pages + * for empty tces. + */ + if (clearing) + return H_SUCCESS; + + sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE; + sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE; + for (i = sttpage; i < sttpage + sttpages; ++i) + if (!stt->pages[i]) + return H_TOO_HARD; + + return H_SUCCESS; +} + /* * Handles TCE requests for emulated devices. 
* Puts guest TCE values to the table and expects user space to convert them. * Called in both real and virtual modes. * Cannot fail so kvmppc_tce_validate must be called before it. * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM + * WARNING: This will be called in real-mode on HV HPT KVM and virtual + * mode on PR KVM or HV radix KVM */ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, unsigned long idx, unsigned long tce) { struct page *page; u64 *tbl; + unsigned long sttpage; idx -= stt->offset; - page = stt->pages[idx / TCES_PER_PAGE]; + sttpage = idx / TCES_PER_PAGE; + page = stt->pages[sttpage]; + + if (!page) { + /* We allow any TCE, not just with read|write permissions */ + if (!tce) + return; + /* + * We must not end up here in real mode, + * kvmppc_rm_ioba_validate() takes care of this. + */ + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (WARN_ON_ONCE(!page)) + return; + stt->pages[sttpage] = page; + } tbl = kvmppc_page_address(page); tbl[idx % TCES_PER_PAGE] = tce; @@ -381,7 +427,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (!stt) return H_TOO_HARD; - ret = kvmppc_ioba_validate(stt, ioba, 1); + ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0); if (ret != H_SUCCESS) return ret; @@ -480,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (tce_list & (SZ_4K - 1)) return H_PARAMETER; - ret = kvmppc_ioba_validate(stt, ioba, npages); + ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false); if (ret != H_SUCCESS) return ret; @@ -583,7 +629,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (!stt) return H_TOO_HARD; - ret = kvmppc_ioba_validate(stt, ioba, npages); + ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0); if (ret != H_SUCCESS) return ret; @@ -635,6 +681,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, idx = (ioba >> stt->page_shift) - stt->offset; page = stt->pages[idx / TCES_PER_PAGE]; + if (!page) { 
+ vcpu->arch.regs.gpr[4] = 0; + return H_SUCCESS; + } tbl = (u64 *)page_address(page); vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; -- 2.17.1