On Fri, Mar 01, 2019 at 03:34:36PM +1100, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can; now it is the turn of the KVM TCE
> tables. Thankfully these are already allocated in 2 levels.
>
> This moves the table's last-level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
>
> This adds kvmppc_rm_ioba_validate() to do an additional test of whether
> the subsequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to the virtual mode handlers.
>
> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>

Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
> Changes:
> v3:
> * fixed alignments in kvmppc_rm_ioba_validate
>
> v2:
> * added kvm mutex around alloc_page to prevent races; in both places we
> test the pointer; if it is NULL, we take the lock and check again, so on
> the fast path we do not take the lock at all
>
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM, the difference is gigabytes of RAM.
> ---
>  arch/powerpc/kvm/book3s_64_vio.c    | 29 ++++++------
>  arch/powerpc/kvm/book3s_64_vio_hv.c | 71 ++++++++++++++++++++++++++---
>  2 files changed, 81 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b04973710..7eed8c90ea3d 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
>  	unsigned long i, npages = kvmppc_tce_pages(stt->size);
>
>  	for (i = 0; i < npages; i++)
> -		__free_page(stt->pages[i]);
> +		if (stt->pages[i])
> +			__free_page(stt->pages[i]);
>
>  	kfree(stt);
>  }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
>  		return VM_FAULT_SIGBUS;
>
>  	page = stt->pages[vmf->pgoff];
> +	if (!page) {
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[vmf->pgoff];
> +		if (!page) {
> +			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (!page) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return VM_FAULT_OOM;
> +			}
> +			stt->pages[vmf->pgoff] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
> +
>  	get_page(page);
>  	vmf->page = page;
>  	return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	struct kvmppc_spapr_tce_table *siter;
>  	unsigned long npages, size = args->size;
>  	int ret = -ENOMEM;
> -	int i;
>
>  	if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
>  		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	stt->kvm = kvm;
>  	INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>
> -	for (i = 0; i < npages; i++) {
> -		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> -		if (!stt->pages[i])
> -			goto fail;
> -	}
> -
>  	mutex_lock(&kvm->lock);
>
>  	/* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>  	if (ret >= 0)
>  		return ret;
>
> - fail:
> -	for (i = 0; i < npages; i++)
> -		if (stt->pages[i])
> -			__free_page(stt->pages[i]);
> -
>  	kfree(stt);
>   fail_acct:
>  	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc729b9a..1cd9373f8bdc 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,78 @@ static u64 *kvmppc_page_address(struct page *page)
>  	return (u64 *) page_address(page);
>  }
>
> +/*
> + * TCE pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCE page is
> + * already allocated or not required (when clearing a TCE entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> +		unsigned long ioba, unsigned long npages, bool clearing)
> +{
> +	unsigned long i, idx, sttpage, sttpages;
> +	unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> +	if (ret)
> +		return ret;
> +	/*
> +	 * clearing==true says kvmppc_tce_put won't be allocating pages
> +	 * for empty TCEs.
> +	 */
> +	if (clearing)
> +		return H_SUCCESS;
> +
> +	idx = (ioba >> stt->page_shift) - stt->offset;
> +	sttpage = idx / TCES_PER_PAGE;
> +	sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
> +		TCES_PER_PAGE;
> +	for (i = sttpage; i < sttpage + sttpages; ++i)
> +		if (!stt->pages[i])
> +			return H_TOO_HARD;
> +
> +	return H_SUCCESS;
> +}
> +
>  /*
>   * Handles TCE requests for emulated devices.
>   * Puts guest TCE values to the table and expects user space to convert them.
>   * Called in both real and virtual modes.
>   * Cannot fail so kvmppc_tce_validate must be called before it.
>   *
> - * WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> + * WARNING: This will be called in real-mode on HV HPT KVM and virtual
> + *          mode on PR KVM or HV radix KVM
>   */
>  void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
>  		unsigned long idx, unsigned long tce)
>  {
>  	struct page *page;
>  	u64 *tbl;
> +	unsigned long sttpage;
>
>  	idx -= stt->offset;
> -	page = stt->pages[idx / TCES_PER_PAGE];
> +	sttpage = idx / TCES_PER_PAGE;
> +	page = stt->pages[sttpage];
> +
> +	if (!page) {
> +		/* We allow any TCE, not just with read|write permissions */
> +		if (!tce)
> +			return;
> +		/*
> +		 * We must not end up here in real mode,
> +		 * kvmppc_rm_ioba_validate() takes care of this.
> +		 */
> +		mutex_lock(&stt->kvm->lock);
> +		page = stt->pages[sttpage];
> +		if (!page) {
> +			page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +			if (WARN_ON_ONCE(!page)) {
> +				mutex_unlock(&stt->kvm->lock);
> +				return;
> +			}
> +			stt->pages[sttpage] = page;
> +		}
> +		mutex_unlock(&stt->kvm->lock);
> +	}
>  	tbl = kvmppc_page_address(page);
>
>  	tbl[idx % TCES_PER_PAGE] = tce;
> @@ -381,7 +436,7 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>  	if (!stt)
>  		return H_TOO_HARD;
>
> -	ret = kvmppc_ioba_validate(stt, ioba, 1);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>
> @@ -480,7 +535,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
>  	if (tce_list & (SZ_4K - 1))
>  		return H_PARAMETER;
>
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
>  	if (ret != H_SUCCESS)
>  		return ret;
>
> @@ -583,7 +638,7 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu,
>  	if (!stt)
>  		return H_TOO_HARD;
>
> -	ret = kvmppc_ioba_validate(stt, ioba, npages);
> +	ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
>  	if (ret != H_SUCCESS)
>  		return ret;
>
> @@ -635,6 +690,10 @@ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>
>  	idx = (ioba >> stt->page_shift) - stt->offset;
>  	page = stt->pages[idx / TCES_PER_PAGE];
> +	if (!page) {
> +		vcpu->arch.regs.gpr[4] = 0;
> +		return H_SUCCESS;
> +	}
>  	tbl = (u64 *)page_address(page);
>
>  	vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
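The lazy allocation in the patch follows the double-checked locking pattern the v2 changelog describes: test the pointer lock-free, and only if it is NULL take kvm->lock and test again, so the common case never touches the lock. Below is a minimal standalone userspace C sketch of the same pattern, with a pthread mutex standing in for kvm->lock and calloc() standing in for alloc_page(GFP_KERNEL | __GFP_ZERO); all names in it are illustrative, not from the patch.

    #include <pthread.h>
    #include <stdlib.h>

    #define NPAGES     64
    #define PAGE_BYTES 4096

    static void *pages[NPAGES];   /* lazily allocated table pages, NULL until first use */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Return pages[i], allocating a zeroed page on first use; NULL on OOM. */
    static void *get_page_lazy(unsigned int i)
    {
            void *p = pages[i];

            if (p)                          /* fast path: no lock taken */
                    return p;

            pthread_mutex_lock(&lock);
            p = pages[i];                   /* re-check under the lock */
            if (!p) {
                    p = calloc(1, PAGE_BYTES);   /* zeroed, like __GFP_ZERO */
                    if (p)
                            pages[i] = p;
            }
            pthread_mutex_unlock(&lock);
            return p;
    }

    int main(void)
    {
            return get_page_lazy(0) ? 0 : 1;   /* first call takes the slow path */
    }

Note that the unlocked fast-path read is a benign race only under kernel-like assumptions: an aligned pointer load is atomic, and a stale NULL merely sends the reader down the locked path. Strictly portable C would read pages[i] with C11 atomics.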
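The page-span arithmetic in kvmppc_rm_ioba_validate() can also be checked standalone: with 4 KiB pages holding 64-bit TCEs, TCES_PER_PAGE is 512, and _ALIGN_UP is modeled below with the usual power-of-two round-up. The ALIGN_UP macro and the sample values are illustrative, not from the patch.

    #include <stdio.h>

    #define TCES_PER_PAGE 512   /* 4096-byte page / 8-byte TCE */

    /* Round v up to the next multiple of a (a must be a power of two). */
    #define ALIGN_UP(v, a) (((v) + (a) - 1) & ~((unsigned long)(a) - 1))

    int main(void)
    {
            unsigned long idx = 1000, npages = 40;   /* sample TCE range */

            unsigned long sttpage  = idx / TCES_PER_PAGE;
            unsigned long sttpages = ALIGN_UP(idx % TCES_PER_PAGE + npages,
                                              TCES_PER_PAGE) / TCES_PER_PAGE;

            /*
             * TCEs 1000..1039 straddle two backing pages: 1000 / 512 == 1,
             * and 488 + 40 == 528 rounds up to 1024, i.e. 1024 / 512 == 2,
             * so the real-mode check must find both stt->pages[1] and
             * stt->pages[2] already allocated, else it returns H_TOO_HARD.
             */
            printf("first stt page %lu, spanning %lu page(s)\n", sttpage, sttpages);
            return 0;
    }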