For now, one has to globally enable vm.alloc_pgste in order to run KVM
guests. This results in every process getting 4k page tables instead of
only 2k page tables.

As the decision about which page table type to use has to be made at
fork time, there isn't much we can do to avoid this global system
setting.

However, by allowing mixed page table types for one process, we can
simply turn on allocation of 4k page tables with pgstes at one point
and rely on the caller to make sure that everything mapped into the
gmap is based on 4k page tables allocated/mmaped after the switch to
4k page table allocation. If we detect an incompatible page table, we
handle it as an ordinary fault, indicating -EFAULT.

Background: QEMU will in general create the KVM VM before mmap-ing the
memory that will become part of the guest address space. Every mmap
creates new page tables.

To not break legacy or other user space that does this in a different
order, this has to be enabled explicitly by the caller. So the
vm.alloc_pgste option has to stay.

Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
---
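Note (not part of the commit message): the ordering assumption described
above corresponds roughly to the user space sketch below. It is
illustrative only and not part of this patch; error handling, vCPU setup
and the memory size are made up, but the ioctls themselves
(KVM_CREATE_VM, KVM_SET_USER_MEMORY_REGION) are the standard KVM
interface.

	/* Illustrative sketch, not part of this patch. */
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);

		/*
		 * KVM_CREATE_VM reaches kvm_arch_init_vm() and thus
		 * s390_enable_sie(); once that has succeeded, newly
		 * allocated page tables of this mm carry pgstes.
		 */
		int vm = ioctl(kvm, KVM_CREATE_VM, 0);

		/*
		 * Guest memory mmap-ed after VM creation therefore ends up
		 * on 4k page tables with pgstes and can be linked into the
		 * gmap.
		 */
		void *mem = mmap(NULL, 1UL << 30, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		struct kvm_userspace_memory_region region = {
			.slot = 0,
			.guest_phys_addr = 0,
			.memory_size = 1UL << 30,
			.userspace_addr = (unsigned long) mem,
		};
		ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);
		return 0;
	}

Memory mmap-ed (or inherited at fork time) before the switch may still
sit on 2k page tables; __gmap_link() now detects that and returns
-EFAULT. The "4K page table with pgstes" marker in page->_mapcount moves
from 3 to 4 so that pgtable_has_pgste() has a single bit to test and 2k
fragments and pgste page tables can coexist within the same mm.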
 arch/s390/include/asm/pgtable.h | 15 +++++++++--
 arch/s390/kvm/kvm-s390.c        |  4 +--
 arch/s390/mm/gmap.c             | 11 +++++---
 arch/s390/mm/pgalloc.c          | 14 +++++-----
 arch/s390/mm/pgtable.c          | 59 ++++++++++++++++++++++++++++++++++-------
 5 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 3effb26..0fb6d29 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1089,6 +1089,17 @@ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep);
 int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
 			unsigned long *oldpte, unsigned long *oldpgste);
 
+static inline int pgtable_has_pgste(struct mm_struct *mm, unsigned long addr)
+{
+	struct page *page;
+
+	if (!mm_has_pgste(mm))
+		return 0;
+
+	page = pfn_to_page(addr >> PAGE_SHIFT);
+	return atomic_read(&page->_mapcount) & 0x4U;
+}
+
 /*
  * Certain architectures need to do special things when PTEs
  * within a page table are directly modified.  Thus, the following
@@ -1101,7 +1112,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 		pte_val(entry) &= ~_PAGE_NOEXEC;
 	if (pte_present(entry))
 		pte_val(entry) &= ~_PAGE_UNUSED;
-	if (mm_has_pgste(mm))
+	if (pgtable_has_pgste(mm, __pa(ptep)))
 		ptep_set_pte_at(mm, addr, ptep, entry);
 	else
 		*ptep = entry;
@@ -1541,7 +1552,7 @@ static inline swp_entry_t __swp_entry(unsigned long type, unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 
-extern int s390_enable_sie(void);
+extern int s390_enable_sie(bool mixed_pgtables);
 extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 4ef3035..89684bb 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -354,7 +354,7 @@ long kvm_arch_dev_ioctl(struct file *filp,
 			unsigned int ioctl, unsigned long arg)
 {
 	if (ioctl == KVM_S390_ENABLE_SIE)
-		return s390_enable_sie();
+		return s390_enable_sie(false);
 	return -EINVAL;
 }
 
@@ -1808,7 +1808,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 		goto out_err;
 #endif
 
-	rc = s390_enable_sie();
+	rc = s390_enable_sie(false);
 	if (rc)
 		goto out_err;
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 4fb3d3c..ff24ada 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -586,6 +586,9 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
 	/* large pmds cannot yet be handled */
 	if (pmd_large(*pmd))
 		return -EFAULT;
+	/* only page tables with pgstes can be linked into a gmap */
+	if (!pgtable_has_pgste(mm, pmd_pfn(*pmd) << PAGE_SHIFT))
+		return -EFAULT;
 	/* Link gmap segment table entry location to page table. */
 	rc = radix_tree_preload(GFP_KERNEL);
 	if (rc)
@@ -2123,17 +2126,19 @@ static inline void thp_split_mm(struct mm_struct *mm)
 /*
  * switch on pgstes for its userspace process (for kvm)
  */
-int s390_enable_sie(void)
+int s390_enable_sie(bool mixed_pgtables)
 {
 	struct mm_struct *mm = current->mm;
 
 	/* Do we have pgstes? if yes, we are done */
 	if (mm_has_pgste(mm))
 		return 0;
-	/* Fail if the page tables are 2K */
-	if (!mm_alloc_pgste(mm))
+	/* Fail if the page tables are 2K and mixed pgtables are disabled */
+	if (!mixed_pgtables && !mm_alloc_pgste(mm))
 		return -EINVAL;
 	down_write(&mm->mmap_sem);
+	/* allocate page tables with pgste from now on if not already done */
+	mm->context.alloc_pgste = 1;
 	mm->context.has_pgste = 1;
 	/* split thp mappings and disable thp for future mappings */
 	thp_split_mm(mm);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 18918e3..2790473 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -218,7 +218,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	table = (unsigned long *) page_to_phys(page);
 	if (mm_alloc_pgste(mm)) {
 		/* Return 4K page table with PGSTEs */
-		atomic_set(&page->_mapcount, 3);
+		atomic_set(&page->_mapcount, 4);
 		clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
 		clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
 	} else {
@@ -238,7 +238,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	unsigned int bit, mask;
 
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	if (!mm_alloc_pgste(mm)) {
+	if (!pgtable_has_pgste(mm, __pa(table))) {
 		/* Free 2K page table fragment of a 4K page */
 		bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 		spin_lock_bh(&mm->context.pgtable_lock);
@@ -266,9 +266,9 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 
 	mm = tlb->mm;
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
-	if (mm_alloc_pgste(mm)) {
+	if (pgtable_has_pgste(mm, __pa(table))) {
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) (__pa(table) | 3);
+		table = (unsigned long *) (__pa(table) | 4);
 		tlb_remove_table(tlb, table);
 		return;
 	}
@@ -286,7 +286,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 
 static void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 3;
+	unsigned int mask = (unsigned long) _table & 7;
 	void *table = (void *)((unsigned long) _table ^ mask);
 	struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 
@@ -299,11 +299,13 @@ static void __tlb_remove_table(void *_table)
 		if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0)
 			break;
 		/* fallthrough */
-	case 3:		/* 4K page table with pgstes */
+	case 4:		/* 4K page table with pgstes */
 		pgtable_page_dtor(page);
 		atomic_set(&page->_mapcount, -1);
 		__free_page(page);
 		break;
+	default:
+		WARN_ONCE(true, "Unknown table type: %x", mask);
 	}
 }
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index d4d409b..b22c2b6 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -196,7 +196,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
 {
 	pgste_t pgste = __pgste(0);
 
-	if (mm_has_pgste(mm)) {
+	if (pgtable_has_pgste(mm, __pa(ptep))) {
 		pgste = pgste_get_lock(ptep);
 		pgste = pgste_pte_notify(mm, addr, ptep, pgste);
 	}
@@ -207,7 +207,7 @@ static inline pte_t ptep_xchg_commit(struct mm_struct *mm,
 				     unsigned long addr, pte_t *ptep,
 				     pgste_t pgste, pte_t old, pte_t new)
 {
-	if (mm_has_pgste(mm)) {
+	if (pgtable_has_pgste(mm, __pa(ptep))) {
 		if (pte_val(old) & _PAGE_INVALID)
 			pgste_set_key(ptep, pgste, new, mm);
 		if (pte_val(new) & _PAGE_INVALID) {
@@ -263,7 +263,7 @@ pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
 	preempt_disable();
 	pgste = ptep_xchg_start(mm, addr, ptep);
 	old = ptep_flush_lazy(mm, addr, ptep);
-	if (mm_has_pgste(mm)) {
+	if (pgtable_has_pgste(mm, __pa(ptep))) {
 		pgste = pgste_update_all(old, pgste, mm);
 		pgste_set(ptep, pgste);
 	}
@@ -278,7 +278,7 @@ void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 
 	if (!MACHINE_HAS_NX)
 		pte_val(pte) &= ~_PAGE_NOEXEC;
-	if (mm_has_pgste(mm)) {
+	if (pgtable_has_pgste(mm, __pa(ptep))) {
 		pgste = pgste_get(ptep);
 		pgste_set_key(ptep, pgste, pte, mm);
 		pgste = pgste_set_pte(ptep, pgste, pte);
@@ -445,7 +445,7 @@ void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
 {
 	pgste_t pgste;
 
-	/* the mm_has_pgste() check is done in set_pte_at() */
+	/* the pgtable_has_pgste() check is done in set_pte_at() */
 	preempt_disable();
 	pgste = pgste_get_lock(ptep);
 	pgste_val(pgste) &= ~_PGSTE_GPS_ZERO;
@@ -569,6 +569,9 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 	pgste_t pgste;
 	pte_t pte;
 
+	if (!pgtable_has_pgste(mm, __pa(ptep)))
+		return;
+
 	/* Zap unused and logically-zero pages */
 	preempt_disable();
 	pgste = pgste_get_lock(ptep);
@@ -588,18 +591,22 @@ void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
 
 void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
+	const bool has_pgste = pgtable_has_pgste(mm, __pa(ptep));
 	unsigned long ptev;
 	pgste_t pgste;
 
 	/* Clear storage key */
 	preempt_disable();
-	pgste = pgste_get_lock(ptep);
-	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
-			      PGSTE_GR_BIT | PGSTE_GC_BIT);
+	if (has_pgste) {
+		pgste = pgste_get_lock(ptep);
+		pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
+				      PGSTE_GR_BIT | PGSTE_GC_BIT);
+	}
 	ptev = pte_val(*ptep);
 	if (!(ptev & _PAGE_INVALID) && (ptev & _PAGE_WRITE))
 		page_set_storage_key(ptev & PAGE_MASK, PAGE_DEFAULT_KEY, 1);
-	pgste_set_unlock(ptep, pgste);
+	if (has_pgste)
+		pgste_set_unlock(ptep, pgste);
 	preempt_enable();
 }
 
@@ -634,6 +641,10 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
 	 */
 	if (pmd_large(*pmd))
 		return true;
+	if (!pgtable_has_pgste(mm, __pa(pmd))) {
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return false;
+	}
 
 	ptep = pte_alloc_map_lock(mm, pmd, addr, &ptl);
 	if (unlikely(!ptep))
@@ -670,6 +681,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 	ptep = get_locked_pte(mm, addr, &ptl);
 	if (unlikely(!ptep))
 		return -EFAULT;
+	if (!pgtable_has_pgste(mm, __pa(ptep))) {
+		pte_unmap_unlock(ptep, ptl);
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return -EFAULT;
+	}
 
 	new = old = pgste_get_lock(ptep);
 	pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -748,6 +764,11 @@ int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
 	ptep = get_locked_pte(mm, addr, &ptl);
 	if (unlikely(!ptep))
 		return -EFAULT;
+	if (!pgtable_has_pgste(mm, __pa(ptep))) {
+		pte_unmap_unlock(ptep, ptl);
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return -EFAULT;
+	}
 
 	new = old = pgste_get_lock(ptep);
 	/* Reset guest reference bit only */
@@ -780,6 +801,11 @@ int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
 	ptep = get_locked_pte(mm, addr, &ptl);
 	if (unlikely(!ptep))
 		return -EFAULT;
+	if (!pgtable_has_pgste(mm, __pa(ptep))) {
+		pte_unmap_unlock(ptep, ptl);
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return -EFAULT;
+	}
 
 	pgste = pgste_get_lock(ptep);
 	*key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
@@ -820,6 +846,11 @@ int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc,
 	ptep = get_locked_pte(mm, hva, &ptl);
 	if (unlikely(!ptep))
 		return -EFAULT;
+	if (!pgtable_has_pgste(mm, __pa(ptep))) {
+		pte_unmap_unlock(ptep, ptl);
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return -EFAULT;
+	}
 	pgste = pgste_get_lock(ptep);
 	pgstev = pgste_val(pgste);
 	if (oldpte)
@@ -912,6 +943,11 @@ int set_pgste_bits(struct mm_struct *mm, unsigned long hva,
 	ptep = get_locked_pte(mm, hva, &ptl);
 	if (unlikely(!ptep))
 		return -EFAULT;
+	if (!pgtable_has_pgste(mm, __pa(ptep))) {
+		pte_unmap_unlock(ptep, ptl);
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return -EFAULT;
+	}
 
 	new = pgste_get_lock(ptep);
 	pgste_val(new) &= ~bits;
@@ -939,6 +975,11 @@ int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep)
 	ptep = get_locked_pte(mm, hva, &ptl);
 	if (unlikely(!ptep))
 		return -EFAULT;
+	if (!pgtable_has_pgste(mm, __pa(ptep))) {
+		pte_unmap_unlock(ptep, ptl);
+		WARN_ONCE(true, "Guest address on page table without pgste");
+		return -EFAULT;
+	}
 	*pgstep = pgste_val(pgste_get(ptep));
 	pte_unmap_unlock(ptep, ptl);
 	return 0;
-- 
2.9.3