On 15.05.2012, at 06:06, Benjamin Herrenschmidt wrote:

> More recent Power server chips (i.e. based on the 64 bit hash MMU)
> support more than just the traditional 4k and 16M page sizes.  This
> can get quite complicated, because which page sizes are supported,
> which combinations are supported within an MMU segment and how these
> page sizes are encoded both in the SLB entry and the hash PTE can vary
> depending on the CPU model (they are not specified by the
> architecture).  In addition the firmware or hypervisor may not permit
> use of certain page sizes, for various reasons.  Whether various page
> sizes are supported on KVM, for example, depends on whether the PR or
> HV variant of KVM is in use, and on the page size of the memory
> backing the guest's RAM.
>
> This patch adds information to the CPUState and cpu defs to describe
> the supported page sizes and encodings.  Since TCG does not yet
> support any extended page sizes, we just set this to NULL in the
> static CPU definitions, expanding this to the default 4k and 16M page
> sizes when we initialize the cpu state.  When using KVM, however, we
> instead determine available page sizes using the new
> KVM_PPC_GET_SMMU_INFO call.  For old kernels without that call, we use
> some defaults, with some guesswork which should do the right thing for
> existing HV and PR implementations.  The fallback might not be correct
> for future versions, but that's ok, because they'll have
> KVM_PPC_GET_SMMU_INFO.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
> Signed-off-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
> ---
> v2: - Passes checkpatch now (with the exception of the kernel header
>       bit which will eventually be replaced by the real thing when
>       it goes upstream).
>     - Moved back some fixes that were incorrectly located in the
>       second patch (such as setting slb_size)
>     - Fix some issues when using "PR" KVM without proper support
>       for 1T segments
>
> Note: I kept the caching.  It's not a "hot" path, but it's still a
>       fair amount of work to do for every CPU (and we routinely have
>       a bunch of those on power machines), so in the end it adds up...
>
>  linux-headers/linux/kvm.h   |   26 ++++++
>  target-ppc/cpu.h            |   31 +++++++
>  target-ppc/helper.c         |    7 ++
>  target-ppc/kvm.c            |  210 +++++++++++++++++++++++++++++++++++++++++++
>  target-ppc/kvm_ppc.h        |    5 +
>  target-ppc/translate_init.c |   21 +++++
>  6 files changed, 300 insertions(+), 0 deletions(-)
>
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index ee7bd9c..015b6db 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h

Please split Linux kernel header updates into their own patch, so I can
easily redo them when necessary.

Also, you sent this mail to the wrong mailing lists - it's a QEMU
patch :). CC'ing the correct ones.
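Btw, for anyone who wants to poke at the new interface from a standalone
tool: the call sequence is just a capability check on the system fd
followed by a VM ioctl. A minimal sketch (not part of the patch; it
assumes the updated linux/kvm.h below, and error handling on open() and
KVM_CREATE_VM is elided):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        struct kvm_ppc_smmu_info info;
        int kvm = open("/dev/kvm", O_RDWR);     /* system fd */
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);  /* VM fd */
        int i;

        /* Old kernels don't have the ioctl; callers must fall back */
        if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_GET_SMMU_INFO) <= 0) {
            fprintf(stderr, "no KVM_PPC_GET_SMMU_INFO, need fallback\n");
            return 1;
        }
        if (ioctl(vm, KVM_PPC_GET_SMMU_INFO, &info) < 0) {
            perror("KVM_PPC_GET_SMMU_INFO");
            return 1;
        }
        /* Slots with page_shift == 0 are unused */
        for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
            if (info.sps[i].page_shift) {
                printf("segment base shift %u, slb_enc 0x%x\n",
                       info.sps[i].page_shift, info.sps[i].slb_enc);
            }
        }
        return 0;
    }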
> @@ -449,6 +449,30 @@ struct kvm_ppc_pvinfo {
>      __u8 pad[108];
>  };
>
> +/* for KVM_PPC_GET_SMMU_INFO */
> +#define KVM_PPC_PAGE_SIZES_MAX_SZ    8
> +
> +struct kvm_ppc_one_page_size {
> +    __u32 page_shift;    /* Page shift (or 0) */
> +    __u32 pte_enc;       /* Encoding in the HPTE (>>12) */
> +};
> +
> +struct kvm_ppc_one_seg_page_size {
> +    __u32 page_shift;    /* Base page shift of segment (or 0) */
> +    __u32 slb_enc;       /* SLB encoding for BookS */
> +    struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
> +#define KVM_PPC_PAGE_SIZES_REAL    0x00000001
> +#define KVM_PPC_1T_SEGMENTS        0x00000002
> +
> +struct kvm_ppc_smmu_info {
> +    __u64 flags;
> +    __u32 slb_size;
> +    __u32 pad;
> +    struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
>  #define KVMIO 0xAE
>
>  /* machine type bits, to be used as argument to KVM_CREATE_VM */
> @@ -789,6 +813,8 @@ struct kvm_s390_ucas_mapping {
>  /* Available with KVM_CAP_PCI_2_3 */
>  #define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO, 0xa4, \
>                                         struct kvm_assigned_pci_dev)
> +/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
> +#define KVM_PPC_GET_SMMU_INFO     _IOR(KVMIO, 0xa5, struct kvm_ppc_smmu_info)
>
>  /*
>   * ioctls for vcpu fds
> diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
> index 84c9674..3ed75ac 100644
> --- a/target-ppc/cpu.h
> +++ b/target-ppc/cpu.h
> @@ -119,6 +119,8 @@ enum powerpc_mmu_t {
>      POWERPC_MMU_620        = POWERPC_MMU_64 | 0x00000002,
>      /* Architecture 2.06 variant */
>      POWERPC_MMU_2_06       = POWERPC_MMU_64 | POWERPC_MMU_1TSEG | 0x00000003,
> +    /* Architecture 2.06 "degraded" (no 1T segments) */
> +    POWERPC_MMU_2_06d      = POWERPC_MMU_64 | 0x00000003,
>  #endif /* defined(TARGET_PPC64) */
>  };
>
> @@ -874,6 +876,29 @@ enum {
>  #define DBELL_PIRTAG_MASK              0x3fff
>
>  /*****************************************************************************/
> +/* Segment page size information, used by recent hash MMUs
> + * The format of this structure mirrors kvm_ppc_smmu_info
> + */
> +
> +#define PPC_PAGE_SIZES_MAX_SZ 8
> +
> +struct ppc_one_page_size {
> +    uint32_t page_shift;  /* Page shift (or 0) */
> +    uint32_t pte_enc;     /* Encoding in the HPTE (>>12) */
> +};
> +
> +struct ppc_one_seg_page_size {
> +    uint32_t page_shift;  /* Base page shift of segment (or 0) */
> +    uint32_t slb_enc;     /* SLB encoding for BookS */
> +    struct ppc_one_page_size enc[PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
> +struct ppc_segment_page_sizes {
> +    struct ppc_one_seg_page_size sps[PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
> +
> +/*****************************************************************************/
>  /* The whole PowerPC CPU context */
>  #define NB_MMU_MODES 3
>
> @@ -889,6 +914,9 @@ struct ppc_def_t {
>      powerpc_input_t bus_model;
>      uint32_t flags;
>      int bfd_mach;
> +#if defined(TARGET_PPC64)
> +    const struct ppc_segment_page_sizes *sps;
> +#endif
>      void (*init_proc)(CPUPPCState *env);
>      int (*check_pow)(CPUPPCState *env);
>  };
> @@ -1012,6 +1040,9 @@ struct CPUPPCState {
>      uint32_t flags;
>      uint64_t insns_flags;
>      uint64_t insns_flags2;
> +#if defined(TARGET_PPC64)
> +    struct ppc_segment_page_sizes sps;
> +#endif
>
>  #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
>      target_phys_addr_t vpa;
> diff --git a/target-ppc/helper.c b/target-ppc/helper.c
> index e97e496..833d948 100644
> --- a/target-ppc/helper.c
> +++ b/target-ppc/helper.c
> @@ -1617,6 +1617,7 @@ void dump_mmu(FILE *f, fprintf_function cpu_fprintf, CPUPPCState *env)
>  #if defined(TARGET_PPC64)
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>          mmubooks_dump_mmu(f, cpu_fprintf, env);
>          break;
>  #endif
> @@ -1647,6 +1648,7 @@ static inline int check_physical(CPUPPCState *env, mmu_ctx_t *ctx,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>          /* Real address are 60 bits long */
>          ctx->raddr &= 0x0FFFFFFFFFFFFFFFULL;
>          ctx->prot |= PAGE_WRITE;
> @@ -1727,6 +1729,7 @@ int get_physical_address (CPUPPCState *env, mmu_ctx_t *ctx, target_ulong eaddr,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif
>          if (ret < 0) {
>              /* We didn't match any BAT entry or don't have BATs */
> @@ -1867,6 +1870,7 @@ int cpu_ppc_handle_mmu_fault (CPUPPCState *env, target_ulong address, int rw,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif
>          env->exception_index = POWERPC_EXCP_ISI;
>          env->error_code = 0x40000000;
> @@ -1977,6 +1981,7 @@ int cpu_ppc_handle_mmu_fault (CPUPPCState *env, target_ulong address, int rw,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif
>          env->exception_index = POWERPC_EXCP_DSI;
>          env->error_code = 0;
> @@ -2299,6 +2304,7 @@ void ppc_tlb_invalidate_all (CPUPPCState *env)
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif /* defined(TARGET_PPC64) */
>          tlb_flush(env, 1);
>          break;
> @@ -2367,6 +2373,7 @@ void ppc_tlb_invalidate_one (CPUPPCState *env, target_ulong addr)
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>          /* tlbie invalidate TLBs for all segments */
>          /* XXX: given the fact that there are too many segments to invalidate,
>           *      and we still don't have a tlb_flush_mask(env, n, mask) in QEMU,
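As an aside for readers: the ppc_segment_page_sizes table added to cpu.h
above is meant to be scanned linearly, with page_shift == 0 marking
unused slots. A consumer needing the HPTE encoding for a given (segment
base size, actual page size) pair would walk it roughly like this -
find_pte_enc() is a hypothetical helper for illustration, not something
this patch adds:

    /* Return the pte_enc for the given segment base / actual page
     * size pair, or -1 if that combination isn't supported. */
    static int find_pte_enc(const struct ppc_segment_page_sizes *sizes,
                            uint32_t seg_page_shift, uint32_t page_shift)
    {
        int i, j;

        if (!seg_page_shift || !page_shift) {
            return -1;  /* shift 0 means "no such size", never a match */
        }
        for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
            if (sizes->sps[i].page_shift != seg_page_shift) {
                continue;  /* empty slot or different segment base size */
            }
            for (j = 0; j < PPC_PAGE_SIZES_MAX_SZ; j++) {
                if (sizes->sps[i].enc[j].page_shift == page_shift) {
                    return sizes->sps[i].enc[j].pte_enc;
                }
            }
        }
        return -1;
    }

e.g. find_pte_enc(&env->sps, 12, 12) would return 0 for the standard
4k-in-4k-segment case once env->sps has been filled in.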
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 0ab7630..2100cb9 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -18,6 +18,7 @@
>  #include <sys/types.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
> +#include <sys/vfs.h>
>
>  #include <linux/kvm.h>
>
> @@ -167,10 +168,219 @@ static int kvm_booke206_tlb_init(CPUPPCState *env)
>      return 0;
>  }
>
> +
> +#if defined(TARGET_PPC64)
> +static void kvm_get_fallback_smmu_info(CPUPPCState *env,
> +                                       struct kvm_ppc_smmu_info *info)
> +{
> +    memset(info, 0, sizeof(*info));
> +
> +    /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so we
> +     * need to "guess" what the supported page sizes are.
> +     *
> +     * For that to work we make a few assumptions:
> +     *
> +     * - If KVM_CAP_PPC_GET_PVINFO is supported we are running "PR"
> +     *   KVM which only supports 4K and 16M pages, but supports them
> +     *   regardless of the backing store characteristics.  We also
> +     *   don't support 1T segments.
> +     *
> +     *   This is safe as if HV KVM ever supports that capability or
> +     *   PR KVM grows support for more page/segment sizes, those
> +     *   versions will have implemented KVM_CAP_PPC_GET_SMMU_INFO and
> +     *   thus we will not hit this fallback.
> +     *
> +     * - Else we are running HV KVM.  This means we only support page
> +     *   sizes that fit in the backing store.  Additionally we only
> +     *   advertise 64K pages if the processor is ARCH 2.06 and we
> +     *   assume P7 encodings for the SLB and hash table.  Here too,
> +     *   we assume support for any newer processor will mean a kernel
> +     *   that implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't
> +     *   hit this fallback.
> +     */
> +    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
> +        /* No flags */
> +        info->flags = 0;
> +        info->slb_size = 64;
> +
> +        /* Standard 4k base page size segment */
> +        info->sps[0].page_shift = 12;
> +        info->sps[0].slb_enc = 0;
> +        info->sps[0].enc[0].page_shift = 12;
> +        info->sps[0].enc[0].pte_enc = 0;
> +
> +        /* Standard 16M large page size segment */
> +        info->sps[1].page_shift = 24;
> +        info->sps[1].slb_enc = SLB_VSID_L;
> +        info->sps[1].enc[0].page_shift = 24;
> +        info->sps[1].enc[0].pte_enc = 0;
> +    } else {
> +        int i = 0;
> +
> +        /* HV KVM has backing store size restrictions */
> +        info->flags = KVM_PPC_PAGE_SIZES_REAL;
> +
> +        if (env->mmu_model & POWERPC_MMU_1TSEG) {
> +            info->flags |= KVM_PPC_1T_SEGMENTS;
> +        }
> +
> +        if (env->mmu_model == POWERPC_MMU_2_06) {
> +            info->slb_size = 32;
> +        } else {
> +            info->slb_size = 64;
> +        }
> +
> +        /* Standard 4k base page size segment */
> +        info->sps[i].page_shift = 12;
> +        info->sps[i].slb_enc = 0;
> +        info->sps[i].enc[0].page_shift = 12;
> +        info->sps[i].enc[0].pte_enc = 0;
> +        i++;
> +
> +        /* 64K on MMU 2.06 */
> +        if (env->mmu_model == POWERPC_MMU_2_06) {
> +            info->sps[i].page_shift = 16;
> +            info->sps[i].slb_enc = 0x110;
> +            info->sps[i].enc[0].page_shift = 16;
> +            info->sps[i].enc[0].pte_enc = 1;
> +            i++;
> +        }
> +
> +        /* Standard 16M large page size segment */
> +        info->sps[i].page_shift = 24;
> +        info->sps[i].slb_enc = SLB_VSID_L;
> +        info->sps[i].enc[0].page_shift = 24;
> +        info->sps[i].enc[0].pte_enc = 0;
> +    }
> +}
> +
> +static void kvm_get_smmu_info(CPUPPCState *env, struct kvm_ppc_smmu_info *info)
> +{
> +#ifdef KVM_CAP_PPC_GET_SMMU_INFO

No need for the #ifdef anymore, because we're syncing the headers now.

> +    int ret;
> +
> +    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
> +        ret = kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
> +        if (ret == 0) {
> +            return;
> +        }
> +    }
> +#endif /* KVM_CAP_PPC_GET_SMMU_INFO */
> +
> +    kvm_get_fallback_smmu_info(env, info);
> +}
> +
> +static long getrampagesize(void)
> +{
> +    struct statfs fs;
> +    int ret;
> +
> +    if (!mem_path) {
> +        /* guest RAM is backed by normal anonymous pages */
> +        return getpagesize();
> +    }
> +
> +    do {
> +        ret = statfs(mem_path, &fs);
> +    } while (ret != 0 && errno == EINTR);
> +
> +    if (ret != 0) {
> +        fprintf(stderr, "Couldn't statfs() memory path: %s\n",
> +                strerror(errno));
> +        exit(1);
> +    }
> +
> +#define HUGETLBFS_MAGIC       0x958458f6
> +
> +    if (fs.f_type != HUGETLBFS_MAGIC) {
> +        /* Explicit mempath, but it's ordinary pages */
> +        return getpagesize();
> +    }
> +
> +    /* It's hugetlbfs, return the huge page size */
> +    return fs.f_bsize;
> +}
> +
> +static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
> +{
> +    if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
> +        return true;
> +    }
> +
> +    return (1ul << shift) <= rampgsize;
> +}
> +
> +static void kvm_fixup_page_sizes(CPUPPCState *env)
> +{
> +    static struct kvm_ppc_smmu_info smmu_info;
> +    static bool has_smmu_info;
> +    long rampagesize;
> +    int iq, ik, jq, jk;
> +
> +    /* We only handle page sizes for 64-bit server guests for now */
> +    if (!(env->mmu_model & POWERPC_MMU_64)) {
> +        return;
> +    }
> +
> +    /* Collect MMU info from kernel if not already done */
> +    if (!has_smmu_info) {
> +        kvm_get_smmu_info(env, &smmu_info);
> +        has_smmu_info = true;
> +    }
> +
> +    rampagesize = getrampagesize();
> +
> +    /* Convert to QEMU form */
> +    memset(&env->sps, 0, sizeof(env->sps));
> +
> +    for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
> +        struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
> +        struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];
> +
> +        if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
> +                                 ksps->page_shift)) {
> +            continue;
> +        }
> +        qsps->page_shift = ksps->page_shift;
> +        qsps->slb_enc = ksps->slb_enc;
> +        for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
> +            if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
> +                                     ksps->enc[jk].page_shift)) {
> +                continue;
> +            }
> +            qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
> +            qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
> +            if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
> +                break;
> +            }
> +        }
> +        if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
> +            break;
> +        }
> +    }
> +    env->slb_nr = smmu_info.slb_size;
> +    if (smmu_info.flags & KVM_PPC_1T_SEGMENTS) {
> +        env->mmu_model |= POWERPC_MMU_1TSEG;
> +    } else {
> +        env->mmu_model &= ~POWERPC_MMU_1TSEG;
> +    }
> +}
> +#else /* defined (TARGET_PPC64) */
> +
> +static inline void kvm_fixup_page_sizes(CPUPPCState *env)
> +{
> +}
> +
> +#endif /* !defined (TARGET_PPC64) */
> +
>  int kvm_arch_init_vcpu(CPUPPCState *cenv)
>  {
>      int ret;
>
> +    /* Gather server mmu info from KVM and update the CPU state*/

Missing space? :)
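One more illustration for the archives: the kvm_valid_page_size() filter
above only bites when KVM_PPC_PAGE_SIZES_REAL is set (i.e. HV KVM), in
which case any page size larger than the RAM backing page size gets
dropped. A self-contained copy of the predicate, with the cases worked
out (just a sketch to show the behavior, not code from the patch):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define KVM_PPC_PAGE_SIZES_REAL 0x00000001

    /* Same logic as kvm_valid_page_size() in the patch */
    static bool valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
    {
        if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
            return true;  /* PR KVM: backing store doesn't constrain sizes */
        }
        return (1ul << shift) <= rampgsize;
    }

    int main(void)
    {
        /* HV KVM, guest RAM on ordinary 4k pages: 16M pages filtered */
        assert(valid_page_size(KVM_PPC_PAGE_SIZES_REAL, 4096, 12));
        assert(!valid_page_size(KVM_PPC_PAGE_SIZES_REAL, 4096, 24));

        /* HV KVM, guest RAM on 16M hugetlbfs: 16M pages pass */
        assert(valid_page_size(KVM_PPC_PAGE_SIZES_REAL, 16l << 20, 24));

        /* PR KVM (flag clear): no restriction */
        assert(valid_page_size(0, 4096, 24));
        return 0;
    }

So with hugepage-backed RAM the guest gets to see 16M pages, and with
anonymous memory it silently loses them, which matches what the commit
message describes.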

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html