On 01/03/2019 12:38, Alexey Kardashevskiy wrote:
> We already allocate hardware TCE tables in multiple levels and skip
> intermediate levels when we can, now it is the turn of the KVM TCE tables.
> Thankfully these are allocated already in 2 levels.
>
> This moves the table's last level allocation from the creating helper to
> kvmppc_tce_put() and kvm_spapr_tce_fault().
>
> This adds kvmppc_rm_ioba_validate() to do an additional test if
> the consequent kvmppc_tce_put() needs a page which has not been allocated;
> if this is the case, we bail out to virtual mode handlers.
>
> Signed-off-by: Alexey Kardashevskiy <aik@xxxxxxxxx>
> ---
> Changes:
> v2:
> * added kvm mutex around alloc_page to prevent races; in both places we
> test the pointer, if NULL, then take a lock and check again so on a fast
> path we do not take a lock at all
>
>
> ---
> For NVLink2 passthrough guests with 128TiB DMA windows and very fragmented
> system RAM the difference is gigabytes of RAM.
> ---
> arch/powerpc/kvm/book3s_64_vio.c | 29 ++++++------
> arch/powerpc/kvm/book3s_64_vio_hv.c | 69 ++++++++++++++++++++++++++---
> 2 files changed, 79 insertions(+), 19 deletions(-)
>
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
> index f02b049..7eed8c9 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -228,7 +228,8 @@ static void release_spapr_tce_table(struct rcu_head *head)
> unsigned long i, npages = kvmppc_tce_pages(stt->size);
>
> for (i = 0; i < npages; i++)
> - __free_page(stt->pages[i]);
> + if (stt->pages[i])
> + __free_page(stt->pages[i]);
>
> kfree(stt);
> }
> @@ -242,6 +243,20 @@ static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
> return VM_FAULT_SIGBUS;
>
> page = stt->pages[vmf->pgoff];
> + if (!page) {
> + mutex_lock(&stt->kvm->lock);
> + page = stt->pages[vmf->pgoff];
> + if (!page) {
> + page = alloc_page(GFP_KERNEL | __GFP_ZERO);
> + if (!page) {
> + mutex_unlock(&stt->kvm->lock);
> + return VM_FAULT_OOM;
> + }
> + stt->pages[vmf->pgoff] = page;
> + }
> + mutex_unlock(&stt->kvm->lock);
> + }
> +
> get_page(page);
> vmf->page = page;
> return 0;
> @@ -296,7 +311,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> struct kvmppc_spapr_tce_table *siter;
> unsigned long npages, size = args->size;
> int ret = -ENOMEM;
> - int i;
>
> if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
> (args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
> @@ -320,12 +334,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> stt->kvm = kvm;
> INIT_LIST_HEAD_RCU(&stt->iommu_tables);
>
> - for (i = 0; i < npages; i++) {
> - stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
> - if (!stt->pages[i])
> - goto fail;
> - }
> -
> mutex_lock(&kvm->lock);
>
> /* Check this LIOBN hasn't been previously allocated */
> @@ -352,11 +360,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
> if (ret >= 0)
> return ret;
>
> - fail:
> - for (i = 0; i < npages; i++)
> - if (stt->pages[i])
> - __free_page(stt->pages[i]);
> -
> kfree(stt);
> fail_acct:
> kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 2206bc7..a0912d5 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -158,23 +158,76 @@ static u64 *kvmppc_page_address(struct page *page)
> return (u64 *) page_address(page);
> }
>
> +/*
> + * TCEs pages are allocated in kvmppc_tce_put() which won't be able to do so
> + * in real mode.
> + * Check if kvmppc_tce_put() can succeed in real mode, i.e. a TCEs page is
> + * allocated or not required (when clearing a tce entry).
> + */
> +static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
> + unsigned long ioba, unsigned long npages, bool clearing)
> +{
> + unsigned long i, sttpage, sttpages;
> + unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
> +
> + if (ret)
> + return ret;
> + /*
> + * clearing==true says kvmppc_tce_put won't be allocating pages
> + * for empty tces.
> + */
> + if (clearing)
> + return H_SUCCESS;
> +
> + sttpage = ((ioba >> stt->page_shift) - stt->offset) / TCES_PER_PAGE;
> + sttpages = (npages + TCES_PER_PAGE - 1) / TCES_PER_PAGE;

This is wrong, v3 is coming.


-- 
Alexey