Hi Suzuki, On 06/29/2018 01:15 PM, Suzuki K Poulose wrote: > So far we had a static stage2 page table handling code, based on a > fixed IPA of 40bits. As we prepare for a configurable IPA size per > VM, make our stage2 page table code dynamic, to do the right thing > for a given VM. We ensure the existing condition is always true even > when we lift the limit on the IPA. i.e, > > page table levels in stage1 >= page table levels in stage2 > > Support for the IPA size configuration needs other changes in the way > we configure the EL2 registers (VTTBR and VTCR). So, the IPA is still > fixed to 40bits. The patch also moves the kvm_page_empty() in asm/kvm_mmu.h > to the top, before including the asm/stage2_pgtable.h to avoid a forward > declaration. > > Cc: Marc Zyngier <marc.zyngier@xxxxxxx> > Cc: Christoffer Dall <cdall@xxxxxxxxxx> > Signed-off-by: Suzuki K Poulose <suzuki.poulose@xxxxxxx> > --- > Changes since V2 > - Restrict the stage2 page table to allow reusing the host page table > helpers for now, until we get stage1 independent page table helpers. I would move this up in the commit msg to motivate the fact we enforce the able condition. > --- > arch/arm64/include/asm/kvm_mmu.h | 14 +- > arch/arm64/include/asm/stage2_pgtable-nopmd.h | 42 ------ > arch/arm64/include/asm/stage2_pgtable-nopud.h | 39 ----- > arch/arm64/include/asm/stage2_pgtable.h | 207 +++++++++++++++++++------- > 4 files changed, 159 insertions(+), 143 deletions(-) > delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopmd.h > delete mode 100644 arch/arm64/include/asm/stage2_pgtable-nopud.h with my very limited knowledge of S2 page table walkers I fail to understand why we now can get rid of stage2_pgtable-nopmd.h and stage2_pgtable-nopud.h and associated FOLDED config. Please could you explain it in the commit message? > > diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h > index dbaf513..a351722 100644 > --- a/arch/arm64/include/asm/kvm_mmu.h > +++ b/arch/arm64/include/asm/kvm_mmu.h > @@ -21,6 +21,7 @@ > #include <asm/page.h> > #include <asm/memory.h> > #include <asm/cpufeature.h> > +#include <asm/kvm_arm.h> > > /* > * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express > @@ -147,6 +148,13 @@ static inline unsigned long __kern_hyp_va(unsigned long v) > #define kvm_phys_mask(kvm) (kvm_phys_size(kvm) - _AC(1, ULL)) > #define kvm_vttbr_baddr_mask(kvm) VTTBR_BADDR_MASK > > +static inline bool kvm_page_empty(void *ptr) > +{ > + struct page *ptr_page = virt_to_page(ptr); > + > + return page_count(ptr_page) == 1; > +} > + > #include <asm/stage2_pgtable.h> > > int create_hyp_mappings(void *from, void *to, pgprot_t prot); > @@ -237,12 +245,6 @@ static inline bool kvm_s2pmd_exec(pmd_t *pmdp) > return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); > } > > -static inline bool kvm_page_empty(void *ptr) > -{ > - struct page *ptr_page = virt_to_page(ptr); > - return page_count(ptr_page) == 1; > -} > - > #define hyp_pte_table_empty(ptep) kvm_page_empty(ptep) > > #ifdef __PAGETABLE_PMD_FOLDED > diff --git a/arch/arm64/include/asm/stage2_pgtable-nopmd.h b/arch/arm64/include/asm/stage2_pgtable-nopmd.h > deleted file mode 100644 > index 0280ded..0000000 > --- a/arch/arm64/include/asm/stage2_pgtable-nopmd.h > +++ /dev/null > @@ -1,42 +0,0 @@ > -/* > - * Copyright (C) 2016 - ARM Ltd > - * > - * This program is free software; you can redistribute it and/or modify > - * it under the terms of the GNU General Public License version 2 as > - * published by the Free Software Foundation. > - * > - * This program is distributed in the hope that it will be useful, > - * but WITHOUT ANY WARRANTY; without even the implied warranty of > - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > - * GNU General Public License for more details. > - * > - * You should have received a copy of the GNU General Public License > - * along with this program. If not, see <http://www.gnu.org/licenses/>. > - */ > - > -#ifndef __ARM64_S2_PGTABLE_NOPMD_H_ > -#define __ARM64_S2_PGTABLE_NOPMD_H_ > - > -#include <asm/stage2_pgtable-nopud.h> > - > -#define __S2_PGTABLE_PMD_FOLDED > - > -#define S2_PMD_SHIFT S2_PUD_SHIFT > -#define S2_PTRS_PER_PMD 1 > -#define S2_PMD_SIZE (1UL << S2_PMD_SHIFT) > -#define S2_PMD_MASK (~(S2_PMD_SIZE-1)) > - > -#define stage2_pud_none(kvm, pud) (0) > -#define stage2_pud_present(kvm, pud) (1) > -#define stage2_pud_clear(kvm, pud) do { } while (0) > -#define stage2_pud_populate(kvm, pud, pmd) do { } while (0) > -#define stage2_pmd_offset(kvm, pud, address) ((pmd_t *)(pud)) > - > -#define stage2_pmd_free(kvm, pmd) do { } while (0) > - > -#define stage2_pmd_addr_end(kvm, addr, end) (end) > - > -#define stage2_pud_huge(kvm, pud) (0) > -#define stage2_pmd_table_empty(kvm, pmdp) (0) > - > -#endif > diff --git a/arch/arm64/include/asm/stage2_pgtable-nopud.h b/arch/arm64/include/asm/stage2_pgtable-nopud.h > deleted file mode 100644 > index cd6304e..0000000 > --- a/arch/arm64/include/asm/stage2_pgtable-nopud.h > +++ /dev/null > @@ -1,39 +0,0 @@ > -/* > - * Copyright (C) 2016 - ARM Ltd > - * > - * This program is free software; you can redistribute it and/or modify > - * it under the terms of the GNU General Public License version 2 as > - * published by the Free Software Foundation. > - * > - * This program is distributed in the hope that it will be useful, > - * but WITHOUT ANY WARRANTY; without even the implied warranty of > - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > - * GNU General Public License for more details. > - * > - * You should have received a copy of the GNU General Public License > - * along with this program. If not, see <http://www.gnu.org/licenses/>. > - */ > - > -#ifndef __ARM64_S2_PGTABLE_NOPUD_H_ > -#define __ARM64_S2_PGTABLE_NOPUD_H_ > - > -#define __S2_PGTABLE_PUD_FOLDED > - > -#define S2_PUD_SHIFT S2_PGDIR_SHIFT > -#define S2_PTRS_PER_PUD 1 > -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) > -#define S2_PUD_MASK (~(S2_PUD_SIZE-1)) > - > -#define stage2_pgd_none(kvm, pgd) (0) > -#define stage2_pgd_present(kvm, pgd) (1) > -#define stage2_pgd_clear(kvm, pgd) do { } while (0) > -#define stage2_pgd_populate(kvm, pgd, pud) do { } while (0) > - > -#define stage2_pud_offset(kvm, pgd, address) ((pud_t *)(pgd)) > - > -#define stage2_pud_free(kvm, x) do { } while (0) > - > -#define stage2_pud_addr_end(kvm, addr, end) (end) > -#define stage2_pud_table_empty(kvm, pmdp) (0) > - > -#endif > diff --git a/arch/arm64/include/asm/stage2_pgtable.h b/arch/arm64/include/asm/stage2_pgtable.h > index 057a405..ffc37cc 100644 > --- a/arch/arm64/include/asm/stage2_pgtable.h > +++ b/arch/arm64/include/asm/stage2_pgtable.h > @@ -19,8 +19,12 @@ > #ifndef __ARM64_S2_PGTABLE_H_ > #define __ARM64_S2_PGTABLE_H_ > > +#include <linux/hugetlb.h> > #include <asm/pgtable.h> > > +/* The PGDIR shift for a given page table with "n" levels. */ > +#define pt_levels_pgdir_shift(n) ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - (n)) > + > /* > * The hardware supports concatenation of up to 16 tables at stage2 entry level > * and we use the feature whenever possible. > @@ -29,118 +33,209 @@ > * On arm64, the smallest PAGE_SIZE supported is 4k, which means > * (PAGE_SHIFT - 3) > 4 holds for all page sizes. Trying to understand that comment. Why do we compare to 4? > * This implies, the total number of page table levels at stage2 expected > - * by the hardware is actually the number of levels required for (KVM_PHYS_SHIFT - 4) > + * by the hardware is actually the number of levels required for (IPA_SHIFT - 4) although understandable, is IPA_SHIFT defined somewhere? > * in normal translations(e.g, stage1), since we cannot have another level in > - * the range (KVM_PHYS_SHIFT, KVM_PHYS_SHIFT - 4). > + * the range (IPA_SHIFT, IPA_SHIFT - 4). I fail to understand the above comment. Could you give a pointer to the spec? > */ > -#define STAGE2_PGTABLE_LEVELS ARM64_HW_PGTABLE_LEVELS(KVM_PHYS_SHIFT - 4) > +#define stage2_pt_levels(ipa_shift) ARM64_HW_PGTABLE_LEVELS((ipa_shift) - 4) > > /* > - * With all the supported VA_BITs and 40bit guest IPA, the following condition > - * is always true: > + * With all the supported VA_BITs and guest IPA, the following condition > + * must be always true: > * > - * STAGE2_PGTABLE_LEVELS <= CONFIG_PGTABLE_LEVELS > + * stage2_pt_levels <= CONFIG_PGTABLE_LEVELS > * > * We base our stage-2 page table walker helpers on this assumption and > * fall back to using the host version of the helper wherever possible. > * i.e, if a particular level is not folded (e.g, PUD) at stage2, we fall back > * to using the host version, since it is guaranteed it is not folded at host. > * > - * If the condition breaks in the future, we can rearrange the host level > - * definitions and reuse them for stage2. Till then... > + * If the condition breaks in the future, we need completely independent > + * page table helpers. Till then... > */ > -#if STAGE2_PGTABLE_LEVELS > CONFIG_PGTABLE_LEVELS > + > +#if stage2_pt_levels(KVM_PHYS_SHIFT) > CONFIG_PGTABLE_LEVELS > #error "Unsupported combination of guest IPA and host VA_BITS." > #endif > > -/* S2_PGDIR_SHIFT is the size mapped by top-level stage2 entry */ > -#define S2_PGDIR_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(4 - STAGE2_PGTABLE_LEVELS) > -#define S2_PGDIR_SIZE (_AC(1, UL) << S2_PGDIR_SHIFT) > -#define S2_PGDIR_MASK (~(S2_PGDIR_SIZE - 1)) > - > /* > * The number of PTRS across all concatenated stage2 tables given by the > * number of bits resolved at the initial level. > */ > -#define PTRS_PER_S2_PGD (1 << (KVM_PHYS_SHIFT - S2_PGDIR_SHIFT)) > +#define __s2_pgd_ptrs(pa, lvls) (1 << ((pa) - pt_levels_pgdir_shift((lvls)))) > +#define __s2_pgd_size(pa, lvls) (__s2_pgd_ptrs((pa), (lvls)) * sizeof(pgd_t)) > + > +#define kvm_stage2_levels(kvm) stage2_pt_levels(kvm_phys_shift(kvm)) > +#define stage2_pgdir_shift(kvm) \ > + pt_levels_pgdir_shift(kvm_stage2_levels(kvm)) > +#define stage2_pgdir_size(kvm) (_AC(1, UL) << stage2_pgdir_shift((kvm))) > +#define stage2_pgdir_mask(kvm) (~(stage2_pgdir_size((kvm)) - 1)) > +#define stage2_pgd_ptrs(kvm) \ > + __s2_pgd_ptrs(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) > + > +#define stage2_pgd_size(kvm) __s2_pgd_size(kvm_phys_shift(kvm), kvm_stage2_levels(kvm)) > > /* > * kvm_mmmu_cache_min_pages is the number of stage2 page table translation > * levels in addition to the PGD. > */ > -#define kvm_mmu_cache_min_pages(kvm) (STAGE2_PGTABLE_LEVELS - 1) > +#define kvm_mmu_cache_min_pages(kvm) (kvm_stage2_levels(kvm) - 1) > > > -#if STAGE2_PGTABLE_LEVELS > 3 > +/* PUD/PMD definitions if present */ > +#define __S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) > +#define __S2_PUD_SIZE (_AC(1, UL) << __S2_PUD_SHIFT) > +#define __S2_PUD_MASK (~(__S2_PUD_SIZE - 1)) > > -#define S2_PUD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(1) > -#define S2_PUD_SIZE (_AC(1, UL) << S2_PUD_SHIFT) > -#define S2_PUD_MASK (~(S2_PUD_SIZE - 1)) > +#define __S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) > +#define __S2_PMD_SIZE (_AC(1, UL) << __S2_PMD_SHIFT) > +#define __S2_PMD_MASK (~(__S2_PMD_SIZE - 1)) Is this renaming mandatory? > > -#define stage2_pgd_none(kvm, pgd) pgd_none(pgd) > -#define stage2_pgd_clear(kvm, pgd) pgd_clear(pgd) > -#define stage2_pgd_present(kvm, pgd) pgd_present(pgd) > -#define stage2_pgd_populate(kvm, pgd, pud) pgd_populate(NULL, pgd, pud) > -#define stage2_pud_offset(kvm, pgd, address) pud_offset(pgd, address) > -#define stage2_pud_free(kvm, pud) pud_free(NULL, pud) > +#define __s2_pud_index(addr) \ > + (((addr) >> __S2_PUD_SHIFT) & (PTRS_PER_PTE - 1)) > +#define __s2_pmd_index(addr) \ > + (((addr) >> __S2_PMD_SHIFT) & (PTRS_PER_PTE - 1)) > > -#define stage2_pud_table_empty(kvm, pudp) kvm_page_empty(pudp) > +#define __kvm_has_stage2_levels(kvm, min_levels) \ > + ((CONFIG_PGTABLE_LEVELS >= min_levels) && (kvm_stage2_levels(kvm) >= min_levels)) kvm_stage2_levels <= CONFIG_PGTABLE_LEVELS so you should just need to check kvm_stage2_levels? > + > +#define kvm_stage2_has_pgd(kvm) __kvm_has_stage2_levels(kvm, 4) > +#define kvm_stage2_has_pud(kvm) __kvm_has_stage2_levels(kvm, 3) > + > +static inline int stage2_pgd_none(struct kvm *kvm, pgd_t pgd) > +{ > + return kvm_stage2_has_pgd(kvm) ? pgd_none(pgd) : 0; > +} > + > +static inline void stage2_pgd_clear(struct kvm *kvm, pgd_t *pgdp) > +{ > + if (kvm_stage2_has_pgd(kvm)) > + pgd_clear(pgdp); > +} > + > +static inline int stage2_pgd_present(struct kvm *kvm, pgd_t pgd) > +{ > + return kvm_stage2_has_pgd(kvm) ? pgd_present(pgd) : 1; > +} > + > +static inline void stage2_pgd_populate(struct kvm *kvm, pgd_t *pgdp, pud_t *pud) > +{ > + if (kvm_stage2_has_pgd(kvm)) > + pgd_populate(NULL, pgdp, pud); > + else > + BUG(); > +} > + > +static inline pud_t *stage2_pud_offset(struct kvm *kvm, > + pgd_t *pgd, unsigned long address) > +{ > + if (kvm_stage2_has_pgd(kvm)) { > + phys_addr_t pud_phys = pgd_page_paddr(*pgd); > + > + pud_phys += __s2_pud_index(address) * sizeof(pud_t); > + return __va(pud_phys); > + } > + return (pud_t *)pgd; > +} > + > +static inline void stage2_pud_free(struct kvm *kvm, pud_t *pud) > +{ > + if (kvm_stage2_has_pgd(kvm)) > + pud_free(NULL, pud); > +} > + > +static inline int stage2_pud_table_empty(struct kvm *kvm, pud_t *pudp) > +{ > + return kvm_stage2_has_pgd(kvm) && kvm_page_empty(pudp); > +} > > static inline phys_addr_t > stage2_pud_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) > { > - phys_addr_t boundary = (addr + S2_PUD_SIZE) & S2_PUD_MASK; > + if (kvm_stage2_has_pgd(kvm)) { > + phys_addr_t boundary = (addr + __S2_PUD_SIZE) & __S2_PUD_MASK; > > - return (boundary - 1 < end - 1) ? boundary : end; > + return (boundary - 1 < end - 1) ? boundary : end; > + } > + return end; > } > > -#endif /* STAGE2_PGTABLE_LEVELS > 3 */ > +static inline int stage2_pud_none(struct kvm *kvm, pud_t pud) > +{ > + return kvm_stage2_has_pud(kvm) ? pud_none(pud) : 0; > +} > > +static inline void stage2_pud_clear(struct kvm *kvm, pud_t *pudp) > +{ > + if (kvm_stage2_has_pud(kvm)) > + pud_clear(pudp); > +} > > -#if STAGE2_PGTABLE_LEVELS > 2 > +static inline int stage2_pud_present(struct kvm *kvm, pud_t pud) > +{ > + return kvm_stage2_has_pud(kvm) ? pud_present(pud) : 1; > +} > > -#define S2_PMD_SHIFT ARM64_HW_PGTABLE_LEVEL_SHIFT(2) > -#define S2_PMD_SIZE (_AC(1, UL) << S2_PMD_SHIFT) > -#define S2_PMD_MASK (~(S2_PMD_SIZE - 1)) > +static inline void stage2_pud_populate(struct kvm *kvm, pud_t *pudp, pmd_t *pmd) > +{ > + if (kvm_stage2_has_pud(kvm)) > + pud_populate(NULL, pudp, pmd); > + else > + BUG(); > +} > > -#define stage2_pud_none(kvm, pud) pud_none(pud) > -#define stage2_pud_clear(kvm, pud) pud_clear(pud) > -#define stage2_pud_present(kvm, pud) pud_present(pud) > -#define stage2_pud_populate(kvm, pud, pmd) pud_populate(NULL, pud, pmd) > -#define stage2_pmd_offset(kvm, pud, address) pmd_offset(pud, address) > -#define stage2_pmd_free(kvm, pmd) pmd_free(NULL, pmd) > +static inline pmd_t *stage2_pmd_offset(struct kvm *kvm, > + pud_t *pud, unsigned long address) > +{ > + if (kvm_stage2_has_pud(kvm)) { > + phys_addr_t pmd_phys = pud_page_paddr(*pud); > > -#define stage2_pud_huge(kvm, pud) pud_huge(pud) > -#define stage2_pmd_table_empty(kvm, pmdp) kvm_page_empty(pmdp) > + pmd_phys += __s2_pmd_index(address) * sizeof(pmd_t); > + return __va(pmd_phys); > + } > + return (pmd_t *)pud; > +} > + > +static inline void stage2_pmd_free(struct kvm *kvm, pmd_t *pmd) > +{ > + if (kvm_stage2_has_pud(kvm)) > + pmd_free(NULL, pmd); > +} > + > +static inline int stage2_pmd_table_empty(struct kvm *kvm, pmd_t *pmdp) > +{ > + return kvm_stage2_has_pud(kvm) && kvm_page_empty(pmdp); > +} > > static inline phys_addr_t > stage2_pmd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) > { > - phys_addr_t boundary = (addr + S2_PMD_SIZE) & S2_PMD_MASK; > + if (kvm_stage2_has_pud(kvm)) { > + phys_addr_t boundary = (addr + __S2_PMD_SIZE) & __S2_PMD_MASK; > > - return (boundary - 1 < end - 1) ? boundary : end; > + return (boundary - 1 < end - 1) ? boundary : end; > + } > + return end; > } > > -#endif /* STAGE2_PGTABLE_LEVELS > 2 */ > +static inline int stage2_pud_huge(struct kvm *kvm, pud_t pud) > +{ > + return kvm_stage2_has_pud(kvm) ? pud_huge(pud) : 0; > +} > > #define stage2_pte_table_empty(kvm, ptep) kvm_page_empty(ptep) > > -#if STAGE2_PGTABLE_LEVELS == 2 > -#include <asm/stage2_pgtable-nopmd.h> > -#elif STAGE2_PGTABLE_LEVELS == 3 > -#include <asm/stage2_pgtable-nopud.h> > -#endif > - > -#define stage2_pgd_size(kvm) (PTRS_PER_S2_PGD * sizeof(pgd_t)) > - > -#define stage2_pgd_index(kvm, addr) \ > - (((addr) >> S2_PGDIR_SHIFT) & (PTRS_PER_S2_PGD - 1)) > +static inline unsigned long stage2_pgd_index(struct kvm *kvm, phys_addr_t addr) > +{ > + return (addr >> stage2_pgdir_shift(kvm)) & (stage2_pgd_ptrs(kvm) - 1); > +} > > static inline phys_addr_t > stage2_pgd_addr_end(struct kvm *kvm, phys_addr_t addr, phys_addr_t end) > { > - phys_addr_t boundary = (addr + S2_PGDIR_SIZE) & S2_PGDIR_MASK; > + phys_addr_t boundary; > > + boundary = (addr + stage2_pgdir_size(kvm)) & stage2_pgdir_mask(kvm); > return (boundary - 1 < end - 1) ? boundary : end; > } > > Globally this patch is pretty hard to review. I don't know if it is possible to split into 2. 1) Addition of some helper macros. 2) removal of nopud and nopmd and implementation of the corresponding macros? Thanks Eric