On Wed, Feb 27, 2019 at 07:42:35PM +1100, Alexey Kardashevskiy wrote: > We already allocate hardware TCE tables in multiple levels and skip > intermediate levels when we can, now it is a turn of the KVM TCE tables. > Thankfully these are allocated already in 2 levels. > > This moves the table's last level allocation from the creating helper to > kvmppc_tce_put() and kvm_spapr_tce_fault(). > > This adds kvmppc_rm_ioba_validate() to do an additional test if > the consequent kvmppc_tce_put() needs a page which has not been allocated; > if this is the case, we bail out to virtual mode handlers. > > Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx> Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> > --- > > For NVLink2 passthrough guests with 128TiB DMA windows (when we push GPU RAM > above the PCI MMIO window in the guest) and very fragmented system RAM > the difference is about 16GiB of RAM before and after this patch. > > --- > arch/powerpc/kvm/book3s_64_vio.c | 21 ++++------ > arch/powerpc/kvm/book3s_64_vio_hv.c | 62 ++++++++++++++++++++++++++--- > 2 files changed, 64 insertions(+), 19 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c > index f02b049..281b56b 100644 > --- a/arch/powerpc/kvm/book3s_64_vio.c > +++ b/arch/powerpc/kvm/book3s_64_vio.c > @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head) > unsigned long i, npages = kvmppc_tce_pages(stt->size); > > for (i = 0; i < npages; i++) > - __free_page(stt->pages[i]); > + if (stt->pages[i]) > + __free_page(stt->pages[i]); > > kfree(stt); > } > @@ -241,6 +242,12 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf) > if (vmf->pgoff >= kvmppc_tce_pages(stt->size)) > return VM_FAULT_SIGBUS; > > + if (!stt->pages[vmf->pgoff]) { > + stt->pages[vmf->pgoff] = alloc_page(GFP_KERNEL | __GFP_ZERO); > + if (!stt->pages[vmf->pgoff]) > + return VM_FAULT_OOM; > + } > + > page = stt->pages[vmf->pgoff]; > get_page(page); > vmf->page = page; > @@ -296,7 +303,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > struct kvmppc_spapr_tce_table *siter; > unsigned long npages, size = args->size; > int ret = -ENOMEM; > - int i; > > if (!args->size || args->page_shift < 12 || args->page_shift > 34 || > (args->offset + args->size > (ULLONG_MAX >> args->page_shift))) > @@ -320,12 +326,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > stt->kvm = kvm; > INIT_LIST_HEAD_RCU(&stt->iommu_tables); > > - for (i = 0; i < npages; i++) { > - stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); > - if (!stt->pages[i]) > - goto fail; > - } > - > mutex_lock(&kvm->lock); > > /* Check this LIOBN hasn't been previously allocated */ > @@ -352,11 +352,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, > if (ret >= 0) > return ret; > > - fail: > - for (i = 0; i < npages; i++) > - if (stt->pages[i]) > - __free_page(stt->pages[i]); > - > kfree(stt); > fail_acct: > kvmppc_account_memlimit(kvmppc_stt_pages(npages), false); > diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c > index 2206bc7..fbb920d 100644 > --- a/arch/powerpc/kvm/book3s_64_vio_hv.c > +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c > @@ -158,23 +158,69 @@ static u64 *kvmppc_page_address(struct page *page) > return (u64 *) page_address(page); > } > > +/* > + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so > + * in real mode. > + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is > + * allocated or not required (when clearing a tce entry). > + */ > +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt, > + unsigned long ioba, unsigned long npages, bool clearing) > +{ > + unsigned long i, sttpage, sttpages; > + unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages); > + > + if (ret) > + return ret; > + /* > + * clearing==true says kvmppc_tce_put won't be allocating pages > + * for empty tces. > + */ > + if (clearing) > + return H_SUCCESS; > + > + sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE; > + sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE; > + for (i = sttpage; i < sttpage + sttpages; ++i) > + if (!stt->pages[i]) > + return H_TOO_HARD; > + > + return H_SUCCESS; > +} > + > /* > * Handles TCE requests for emulated devices. > * Puts guest TCE values to the table and expects user space to convert them. > * Called in both real and virtual modes. > * Cannot fail so kvmppc_tce_validate must be called before it. > * > - * WARNING: This will be called in real-mode on HV KVM and virtual > - * mode on PR KVM > + * WARNING: This will be called in real-mode on HV HPT KVM and virtual > + * mode on PR KVM or HV radix KVM > */ > void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, > unsigned long idx, unsigned long tce) > { > struct page *page; > u64 *tbl; > + unsigned long sttpage; > > idx -= stt->offset; > - page = stt->pages[idx / TCES_PER_PAGE]; > + sttpage = idx / TCES_PER_PAGE; > + page = stt->pages[sttpage]; > + > + if (!page) { > + /* We allow any TCE, not just with read|write permissions */ > + if (!tce) > + return; > + /* > + * We must not end up here in real mode, > + * kvmppc_rm_ioba_validate() takes care of this. > + */ > + page = alloc_page(GFP_KERNEL | __GFP_ZERO); > + if (WARN_ON_ONCE(!page)) > + return; > + stt->pages[sttpage] = page; > + } > tbl = kvmppc_page_address(page); > > tbl[idx % TCES_PER_PAGE] = tce; > @@ -381,7 +427,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > if (!stt) > return H_TOO_HARD; > > - ret = kvmppc_ioba_validate(stt, ioba, 1); > + ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0); > if (ret != H_SUCCESS) > return ret; > > @@ -480,7 +526,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, > if (tce_list & (SZ_4K - 1)) > return H_PARAMETER; > > - ret = kvmppc_ioba_validate(stt, ioba, npages); > + ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false); > if (ret != H_SUCCESS) > return ret; > > @@ -583,7 +629,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, > if (!stt) > return H_TOO_HARD; > > - ret = kvmppc_ioba_validate(stt, ioba, npages); > + ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0); > if (ret != H_SUCCESS) > return ret; > > @@ -635,6 +681,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, > > idx = (ioba >> stt->page_shift) - stt->offset; > page = stt->pages[idx / TCES_PER_PAGE]; > + if (!page) { > + vcpu->arch.regs.gpr[4] = 0; > + return H_SUCCESS; > + } > tbl = (u64 *)page_address(page); > > vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE]; -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Attachment:
signature.asc
Description: PGP signature