On 15.05.2012, at 06:06, Benjamin Herrenschmidt wrote:

> More recent Power server chips (i.e. based on the 64 bit hash MMU)
> support more than just the traditional 4k and 16M page sizes.  This
> can get quite complicated, because which page sizes are supported,
> which combinations are supported within an MMU segment and how these
> page sizes are encoded both in the SLB entry and the hash PTE can vary
> depending on the CPU model (they are not specified by the
> architecture).  In addition the firmware or hypervisor may not permit
> use of certain page sizes, for various reasons.  Whether various page
> sizes are supported on KVM, for example, depends on whether the PR or
> HV variant of KVM is in use, and on the page size of the memory
> backing the guest's RAM.
>
> This patch adds information to the CPUState and cpu defs to describe
> the supported page sizes and encodings.  Since TCG does not yet
> support any extended page sizes, we just set this to NULL in the
> static CPU definitions, expanding this to the default 4k and 16M page
> sizes when we initialize the cpu state.  When using KVM, however, we
> instead determine available page sizes using the new
> KVM_PPC_GET_SMMU_INFO call.  For old kernels without that call, we use
> some defaults, with some guesswork which should do the right thing for
> existing HV and PR implementations.  The fallback might not be correct
> for future versions, but that's ok, because they'll have
> KVM_PPC_GET_SMMU_INFO.
>
> Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
> Signed-off-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
> ---
> v2: - Passes checkpatch now (with the exception of the kernel header
>       bit which will eventually be replaced by the real thing when
>       it goes upstream).
>     - Moved back some fixes that were incorrectly located in the
>       second patch (such as setting slb_size)
>     - Fix some issues when using "PR" KVM without proper support
>       for 1T segments
>
> Note: I kept the caching.  It's not a "hot" path, but it's still a
>       fair amount of work to do for every CPU (and we routinely have
>       a bunch of those on power machines), so in the end it adds up...
>
>  linux-headers/linux/kvm.h   |   26 ++++++
>  target-ppc/cpu.h            |   31 +++++++
>  target-ppc/helper.c         |    7 ++
>  target-ppc/kvm.c            |  210 +++++++++++++++++++++++++++++++++++++++++++
>  target-ppc/kvm_ppc.h        |    5 +
>  target-ppc/translate_init.c |   21 +++++
>  6 files changed, 300 insertions(+), 0 deletions(-)
>
> diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h
> index ee7bd9c..015b6db 100644
> --- a/linux-headers/linux/kvm.h
> +++ b/linux-headers/linux/kvm.h

Please split Linux kernel header updates into their own patch, so I can
easily redo them when necessary.

Also, you sent this mail to the wrong mailing lists - it's a QEMU
patch :). CC'ing the correct ones.
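Btw, for anyone who wants to poke at the new interface from a standalone
tool: the call sequence is just a capability check on the system fd
followed by a VM ioctl. A minimal sketch (not part of the patch; it
assumes the updated linux/kvm.h below, and error handling on open() and
KVM_CREATE_VM is elided):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    int main(void)
    {
        struct kvm_ppc_smmu_info info;
        int kvm = open("/dev/kvm", O_RDWR);     /* system fd */
        int vm = ioctl(kvm, KVM_CREATE_VM, 0);  /* VM fd */
        int i;

        /* Old kernels don't have the ioctl; callers must fall back */
        if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_PPC_GET_SMMU_INFO) <= 0) {
            fprintf(stderr, "no KVM_PPC_GET_SMMU_INFO, need fallback\n");
            return 1;
        }
        if (ioctl(vm, KVM_PPC_GET_SMMU_INFO, &info) < 0) {
            perror("KVM_PPC_GET_SMMU_INFO");
            return 1;
        }
        /* Slots with page_shift == 0 are unused */
        for (i = 0; i < KVM_PPC_PAGE_SIZES_MAX_SZ; i++) {
            if (info.sps[i].page_shift) {
                printf("segment base shift %u, slb_enc 0x%x\n",
                       info.sps[i].page_shift, info.sps[i].slb_enc);
            }
        }
        return 0;
    }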
> @@ -449,6 +449,30 @@ struct kvm_ppc_pvinfo {
>      __u8 pad[108];
>  };
>
> +/* for KVM_PPC_GET_SMMU_INFO */
> +#define KVM_PPC_PAGE_SIZES_MAX_SZ    8
> +
> +struct kvm_ppc_one_page_size {
> +    __u32 page_shift;    /* Page shift (or 0) */
> +    __u32 pte_enc;       /* Encoding in the HPTE (>>12) */
> +};
> +
> +struct kvm_ppc_one_seg_page_size {
> +    __u32 page_shift;    /* Base page shift of segment (or 0) */
> +    __u32 slb_enc;       /* SLB encoding for BookS */
> +    struct kvm_ppc_one_page_size enc[KVM_PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
> +#define KVM_PPC_PAGE_SIZES_REAL    0x00000001
> +#define KVM_PPC_1T_SEGMENTS        0x00000002
> +
> +struct kvm_ppc_smmu_info {
> +    __u64 flags;
> +    __u32 slb_size;
> +    __u32 pad;
> +    struct kvm_ppc_one_seg_page_size sps[KVM_PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
>  #define KVMIO 0xAE
>
>  /* machine type bits, to be used as argument to KVM_CREATE_VM */
> @@ -789,6 +813,8 @@ struct kvm_s390_ucas_mapping {
>  /* Available with KVM_CAP_PCI_2_3 */
>  #define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO, 0xa4, \
>                                         struct kvm_assigned_pci_dev)
> +/* Available with KVM_CAP_PPC_GET_SMMU_INFO */
> +#define KVM_PPC_GET_SMMU_INFO     _IOR(KVMIO, 0xa5, struct kvm_ppc_smmu_info)
>
>  /*
>   * ioctls for vcpu fds
> diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
> index 84c9674..3ed75ac 100644
> --- a/target-ppc/cpu.h
> +++ b/target-ppc/cpu.h
> @@ -119,6 +119,8 @@ enum powerpc_mmu_t {
>      POWERPC_MMU_620        = POWERPC_MMU_64 | 0x00000002,
>      /* Architecture 2.06 variant */
>      POWERPC_MMU_2_06       = POWERPC_MMU_64 | POWERPC_MMU_1TSEG | 0x00000003,
> +    /* Architecture 2.06 "degraded" (no 1T segments) */
> +    POWERPC_MMU_2_06d      = POWERPC_MMU_64 | 0x00000003,
>  #endif /* defined(TARGET_PPC64) */
>  };
>
> @@ -874,6 +876,29 @@ enum {
>  #define DBELL_PIRTAG_MASK              0x3fff
>
>  /*****************************************************************************/
> +/* Segment page size information, used by recent hash MMUs
> + * The format of this structure mirrors kvm_ppc_smmu_info
> + */
> +
> +#define PPC_PAGE_SIZES_MAX_SZ 8
> +
> +struct ppc_one_page_size {
> +    uint32_t page_shift;  /* Page shift (or 0) */
> +    uint32_t pte_enc;     /* Encoding in the HPTE (>>12) */
> +};
> +
> +struct ppc_one_seg_page_size {
> +    uint32_t page_shift;  /* Base page shift of segment (or 0) */
> +    uint32_t slb_enc;     /* SLB encoding for BookS */
> +    struct ppc_one_page_size enc[PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
> +struct ppc_segment_page_sizes {
> +    struct ppc_one_seg_page_size sps[PPC_PAGE_SIZES_MAX_SZ];
> +};
> +
> +
> +/*****************************************************************************/
>  /* The whole PowerPC CPU context */
>  #define NB_MMU_MODES 3
>
> @@ -889,6 +914,9 @@ struct ppc_def_t {
>      powerpc_input_t bus_model;
>      uint32_t flags;
>      int bfd_mach;
> +#if defined(TARGET_PPC64)
> +    const struct ppc_segment_page_sizes *sps;
> +#endif
>      void (*init_proc)(CPUPPCState *env);
>      int (*check_pow)(CPUPPCState *env);
>  };
> @@ -1012,6 +1040,9 @@ struct CPUPPCState {
>      uint32_t flags;
>      uint64_t insns_flags;
>      uint64_t insns_flags2;
> +#if defined(TARGET_PPC64)
> +    struct ppc_segment_page_sizes sps;
> +#endif
>
>  #if defined(TARGET_PPC64) && !defined(CONFIG_USER_ONLY)
>      target_phys_addr_t vpa;
> diff --git a/target-ppc/helper.c b/target-ppc/helper.c
> index e97e496..833d948 100644
> --- a/target-ppc/helper.c
> +++ b/target-ppc/helper.c
> @@ -1617,6 +1617,7 @@ void dump_mmu(FILE *f, fprintf_function cpu_fprintf, CPUPPCState *env)
>  #if defined(TARGET_PPC64)
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>          mmubooks_dump_mmu(f, cpu_fprintf, env);
>          break;
>  #endif
> @@ -1647,6 +1648,7 @@ static inline int check_physical(CPUPPCState *env, mmu_ctx_t *ctx,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>          /* Real address are 60 bits long */
>          ctx->raddr &= 0x0FFFFFFFFFFFFFFFULL;
>          ctx->prot |= PAGE_WRITE;
> @@ -1727,6 +1729,7 @@ int get_physical_address (CPUPPCState *env, mmu_ctx_t *ctx, target_ulong eaddr,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif
>          if (ret < 0) {
>              /* We didn't match any BAT entry or don't have BATs */
> @@ -1867,6 +1870,7 @@ int cpu_ppc_handle_mmu_fault (CPUPPCState *env, target_ulong address, int rw,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif
>          env->exception_index = POWERPC_EXCP_ISI;
>          env->error_code = 0x40000000;
> @@ -1977,6 +1981,7 @@ int cpu_ppc_handle_mmu_fault (CPUPPCState *env, target_ulong address, int rw,
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif
>          env->exception_index = POWERPC_EXCP_DSI;
>          env->error_code = 0;
> @@ -2299,6 +2304,7 @@ void ppc_tlb_invalidate_all (CPUPPCState *env)
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>  #endif /* defined(TARGET_PPC64) */
>          tlb_flush(env, 1);
>          break;
> @@ -2367,6 +2373,7 @@ void ppc_tlb_invalidate_one (CPUPPCState *env, target_ulong addr)
>      case POWERPC_MMU_620:
>      case POWERPC_MMU_64B:
>      case POWERPC_MMU_2_06:
> +    case POWERPC_MMU_2_06d:
>          /* tlbie invalidate TLBs for all segments */
>          /* XXX: given the fact that there are too many segments to invalidate,
>           *      and we still don't have a tlb_flush_mask(env, n, mask) in QEMU,
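As an aside for readers: the ppc_segment_page_sizes table added to cpu.h
above is meant to be scanned linearly, with page_shift == 0 marking
unused slots. A consumer needing the HPTE encoding for a given (segment
base size, actual page size) pair would walk it roughly like this -
find_pte_enc() is a hypothetical helper for illustration, not something
this patch adds:

    /* Return the pte_enc for the given segment base / actual page
     * size pair, or -1 if that combination isn't supported. */
    static int find_pte_enc(const struct ppc_segment_page_sizes *sizes,
                            uint32_t seg_page_shift, uint32_t page_shift)
    {
        int i, j;

        if (!seg_page_shift || !page_shift) {
            return -1;  /* shift 0 means "no such size", never a match */
        }
        for (i = 0; i < PPC_PAGE_SIZES_MAX_SZ; i++) {
            if (sizes->sps[i].page_shift != seg_page_shift) {
                continue;  /* empty slot or different segment base size */
            }
            for (j = 0; j < PPC_PAGE_SIZES_MAX_SZ; j++) {
                if (sizes->sps[i].enc[j].page_shift == page_shift) {
                    return sizes->sps[i].enc[j].pte_enc;
                }
            }
        }
        return -1;
    }

e.g. find_pte_enc(&env->sps, 12, 12) would return 0 for the standard
4k-in-4k-segment case once env->sps has been filled in.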
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 0ab7630..2100cb9 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -18,6 +18,7 @@
>  #include <sys/types.h>
>  #include <sys/ioctl.h>
>  #include <sys/mman.h>
> +#include <sys/vfs.h>
>
>  #include <linux/kvm.h>
>
> @@ -167,10 +168,219 @@ static int kvm_booke206_tlb_init(CPUPPCState *env)
>      return 0;
>  }
>
> +
> +#if defined(TARGET_PPC64)
> +static void kvm_get_fallback_smmu_info(CPUPPCState *env,
> +                                       struct kvm_ppc_smmu_info *info)
> +{
> +    memset(info, 0, sizeof(*info));
> +
> +    /* We don't have the new KVM_PPC_GET_SMMU_INFO ioctl, so we
> +     * need to "guess" what the supported page sizes are.
> +     *
> +     * For that to work we make a few assumptions:
> +     *
> +     * - If KVM_CAP_PPC_GET_PVINFO is supported we are running "PR"
> +     *   KVM which only supports 4K and 16M pages, but supports them
> +     *   regardless of the backing store characteristics.  We also
> +     *   don't support 1T segments.
> +     *
> +     *   This is safe as if HV KVM ever supports that capability or
> +     *   PR KVM grows support for more page/segment sizes, those
> +     *   versions will have implemented KVM_CAP_PPC_GET_SMMU_INFO and
> +     *   thus we will not hit this fallback.
> +     *
> +     * - Else we are running HV KVM.  This means we only support page
> +     *   sizes that fit in the backing store.  Additionally we only
> +     *   advertise 64K pages if the processor is ARCH 2.06 and we
> +     *   assume P7 encodings for the SLB and hash table.  Here too,
> +     *   we assume support for any newer processor will mean a kernel
> +     *   that implements KVM_CAP_PPC_GET_SMMU_INFO and thus doesn't
> +     *   hit this fallback.
> +     */
> +    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_PVINFO)) {
> +        /* No flags */
> +        info->flags = 0;
> +        info->slb_size = 64;
> +
> +        /* Standard 4k base page size segment */
> +        info->sps[0].page_shift = 12;
> +        info->sps[0].slb_enc = 0;
> +        info->sps[0].enc[0].page_shift = 12;
> +        info->sps[0].enc[0].pte_enc = 0;
> +
> +        /* Standard 16M large page size segment */
> +        info->sps[1].page_shift = 24;
> +        info->sps[1].slb_enc = SLB_VSID_L;
> +        info->sps[1].enc[0].page_shift = 24;
> +        info->sps[1].enc[0].pte_enc = 0;
> +    } else {
> +        int i = 0;
> +
> +        /* HV KVM has backing store size restrictions */
> +        info->flags = KVM_PPC_PAGE_SIZES_REAL;
> +
> +        if (env->mmu_model & POWERPC_MMU_1TSEG) {
> +            info->flags |= KVM_PPC_1T_SEGMENTS;
> +        }
> +
> +        if (env->mmu_model == POWERPC_MMU_2_06) {
> +            info->slb_size = 32;
> +        } else {
> +            info->slb_size = 64;
> +        }
> +
> +        /* Standard 4k base page size segment */
> +        info->sps[i].page_shift = 12;
> +        info->sps[i].slb_enc = 0;
> +        info->sps[i].enc[0].page_shift = 12;
> +        info->sps[i].enc[0].pte_enc = 0;
> +        i++;
> +
> +        /* 64K on MMU 2.06 */
> +        if (env->mmu_model == POWERPC_MMU_2_06) {
> +            info->sps[i].page_shift = 16;
> +            info->sps[i].slb_enc = 0x110;
> +            info->sps[i].enc[0].page_shift = 16;
> +            info->sps[i].enc[0].pte_enc = 1;
> +            i++;
> +        }
> +
> +        /* Standard 16M large page size segment */
> +        info->sps[i].page_shift = 24;
> +        info->sps[i].slb_enc = SLB_VSID_L;
> +        info->sps[i].enc[0].page_shift = 24;
> +        info->sps[i].enc[0].pte_enc = 0;
> +    }
> +}
> +
> +static void kvm_get_smmu_info(CPUPPCState *env, struct kvm_ppc_smmu_info *info)
> +{
> +#ifdef KVM_CAP_PPC_GET_SMMU_INFO

No need for the #ifdef anymore, because we're syncing the headers now.

> +    int ret;
> +
> +    if (kvm_check_extension(env->kvm_state, KVM_CAP_PPC_GET_SMMU_INFO)) {
> +        ret = kvm_vm_ioctl(env->kvm_state, KVM_PPC_GET_SMMU_INFO, info);
> +        if (ret == 0) {
> +            return;
> +        }
> +    }
> +#endif /* KVM_CAP_PPC_GET_SMMU_INFO */
> +
> +    kvm_get_fallback_smmu_info(env, info);
> +}
> +
> +static long getrampagesize(void)
> +{
> +    struct statfs fs;
> +    int ret;
> +
> +    if (!mem_path) {
> +        /* guest RAM is backed by normal anonymous pages */
> +        return getpagesize();
> +    }
> +
> +    do {
> +        ret = statfs(mem_path, &fs);
> +    } while (ret != 0 && errno == EINTR);
> +
> +    if (ret != 0) {
> +        fprintf(stderr, "Couldn't statfs() memory path: %s\n",
> +                strerror(errno));
> +        exit(1);
> +    }
> +
> +#define HUGETLBFS_MAGIC       0x958458f6
> +
> +    if (fs.f_type != HUGETLBFS_MAGIC) {
> +        /* Explicit mempath, but it's ordinary pages */
> +        return getpagesize();
> +    }
> +
> +    /* It's hugetlbfs, return the huge page size */
> +    return fs.f_bsize;
> +}
> +
> +static bool kvm_valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
> +{
> +    if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
> +        return true;
> +    }
> +
> +    return (1ul << shift) <= rampgsize;
> +}
> +
> +static void kvm_fixup_page_sizes(CPUPPCState *env)
> +{
> +    static struct kvm_ppc_smmu_info smmu_info;
> +    static bool has_smmu_info;
> +    long rampagesize;
> +    int iq, ik, jq, jk;
> +
> +    /* We only handle page sizes for 64-bit server guests for now */
> +    if (!(env->mmu_model & POWERPC_MMU_64)) {
> +        return;
> +    }
> +
> +    /* Collect MMU info from kernel if not already done */
> +    if (!has_smmu_info) {
> +        kvm_get_smmu_info(env, &smmu_info);
> +        has_smmu_info = true;
> +    }
> +
> +    rampagesize = getrampagesize();
> +
> +    /* Convert to QEMU form */
> +    memset(&env->sps, 0, sizeof(env->sps));
> +
> +    for (ik = iq = 0; ik < KVM_PPC_PAGE_SIZES_MAX_SZ; ik++) {
> +        struct ppc_one_seg_page_size *qsps = &env->sps.sps[iq];
> +        struct kvm_ppc_one_seg_page_size *ksps = &smmu_info.sps[ik];
> +
> +        if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
> +                                 ksps->page_shift)) {
> +            continue;
> +        }
> +        qsps->page_shift = ksps->page_shift;
> +        qsps->slb_enc = ksps->slb_enc;
> +        for (jk = jq = 0; jk < KVM_PPC_PAGE_SIZES_MAX_SZ; jk++) {
> +            if (!kvm_valid_page_size(smmu_info.flags, rampagesize,
> +                                     ksps->enc[jk].page_shift)) {
> +                continue;
> +            }
> +            qsps->enc[jq].page_shift = ksps->enc[jk].page_shift;
> +            qsps->enc[jq].pte_enc = ksps->enc[jk].pte_enc;
> +            if (++jq >= PPC_PAGE_SIZES_MAX_SZ) {
> +                break;
> +            }
> +        }
> +        if (++iq >= PPC_PAGE_SIZES_MAX_SZ) {
> +            break;
> +        }
> +    }
> +    env->slb_nr = smmu_info.slb_size;
> +    if (smmu_info.flags & KVM_PPC_1T_SEGMENTS) {
> +        env->mmu_model |= POWERPC_MMU_1TSEG;
> +    } else {
> +        env->mmu_model &= ~POWERPC_MMU_1TSEG;
> +    }
> +}
> +#else /* defined (TARGET_PPC64) */
> +
> +static inline void kvm_fixup_page_sizes(CPUPPCState *env)
> +{
> +}
> +
> +#endif /* !defined (TARGET_PPC64) */
> +
>  int kvm_arch_init_vcpu(CPUPPCState *cenv)
>  {
>      int ret;
>
> +    /* Gather server mmu info from KVM and update the CPU state*/

Missing space? :)
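One more illustration for the archives: the kvm_valid_page_size() filter
above only bites when KVM_PPC_PAGE_SIZES_REAL is set (i.e. HV KVM), in
which case any page size larger than the RAM backing page size gets
dropped. A self-contained copy of the predicate, with the cases worked
out (just a sketch to show the behavior, not code from the patch):

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define KVM_PPC_PAGE_SIZES_REAL 0x00000001

    /* Same logic as kvm_valid_page_size() in the patch */
    static bool valid_page_size(uint32_t flags, long rampgsize, uint32_t shift)
    {
        if (!(flags & KVM_PPC_PAGE_SIZES_REAL)) {
            return true;  /* PR KVM: backing store doesn't constrain sizes */
        }
        return (1ul << shift) <= rampgsize;
    }

    int main(void)
    {
        /* HV KVM, guest RAM on ordinary 4k pages: 16M pages filtered */
        assert(valid_page_size(KVM_PPC_PAGE_SIZES_REAL, 4096, 12));
        assert(!valid_page_size(KVM_PPC_PAGE_SIZES_REAL, 4096, 24));

        /* HV KVM, guest RAM on 16M hugetlbfs: 16M pages pass */
        assert(valid_page_size(KVM_PPC_PAGE_SIZES_REAL, 16l << 20, 24));

        /* PR KVM (flag clear): no restriction */
        assert(valid_page_size(0, 4096, 24));
        return 0;
    }

So with hugepage-backed RAM the guest gets to see 16M pages, and with
anonymous memory it silently loses them, which matches what the commit
message describes.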

Alex

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html