Re: [PATCH] pseries: Add support for new KVM hash table control call

Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> · Wed, 27 Jun 2012 22:12:26 +1000

On Wed, 2012-06-27 at 22:10 +1000, Benjamin Herrenschmidt wrote:
> From: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
> 
> This adds support for the new "reset htab" ioctl which allows qemu
> to properly clean up the MMU hash table when the guest is reset. With
> the corresponding kernel support, reset of a guest now works properly.

Forgot to mention ... this depends on a newer Linux kvm.h from Avi's
-next branch, so don't apply this patch to qemu until kvm.h has the
update adding the definitions for KVM_CAP_PPC_ALLOC_HTAB and
KVM_PPC_ALLOCATE_HTAB.

Cheers,
Ben.

> This also paves the way for indicating a different size hash table
> to the kernel and for the kernel to be able to impose limits on
> the requested size.
> 
> Signed-off-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
> ---
>  hw/spapr.c           |   88 ++++++++++++++++++++++++++++++++------------------
>  hw/spapr.h           |    2 +-
>  target-ppc/kvm.c     |   17 ++++++++++
>  target-ppc/kvm_ppc.h |    7 ++++
>  4 files changed, 82 insertions(+), 32 deletions(-)
> 
> diff --git a/hw/spapr.c b/hw/spapr.c
> index a6bc5e8..e19dbd8 100644
> --- a/hw/spapr.c
> +++ b/hw/spapr.c
> @@ -83,6 +83,8 @@
>  
>  #define PHANDLE_XICP            0x00001111
>  
> +#define HTAB_SIZE(spapr)        (1ULL << ((spapr)->htab_shift))
> +
>  sPAPREnvironment *spapr;
>  static int spapr_has_graphics;
>  
> @@ -111,12 +113,13 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num,
>      return qirq;
>  }
>  
> -static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
> +static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr)
>  {
>      int ret = 0, offset;
>      CPUPPCState *env;
>      char cpu_model[32];
>      int smt = kvmppc_smt_threads();
> +    uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)};
>  
>      assert(spapr->cpu_model);
>  
> @@ -140,8 +143,16 @@ static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr)
>              return offset;
>          }
>  
> -        ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> -                          sizeof(associativity));
> +        if (nb_numa_nodes > 1) {
> +            ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity,
> +                              sizeof(associativity));
> +            if (ret < 0) {
> +                return ret;
> +            }
> +        }
> +
> +        ret = fdt_setprop(fdt, offset, "ibm,pft-size",
> +                          pft_size_prop, sizeof(pft_size_prop));
>          if (ret < 0) {
>              return ret;
>          }
> @@ -189,15 +200,13 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>                                     target_phys_addr_t initrd_size,
>                                     target_phys_addr_t kernel_size,
>                                     const char *boot_device,
> -                                   const char *kernel_cmdline,
> -                                   long hash_shift)
> +                                   const char *kernel_cmdline)
>  {
>      void *fdt;
>      CPUPPCState *env;
>      uint64_t mem_reg_property[2];
>      uint32_t start_prop = cpu_to_be32(initrd_base);
>      uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size);
> -    uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)};
>      char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt"
>          "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk";
>      char qemu_hypertas_prop[] = "hcall-memop1";
> @@ -366,8 +375,6 @@ static void *spapr_create_fdt_skel(const char *cpu_model,
>          _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq)));
>          _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq)));
>          _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr)));
> -        _FDT((fdt_property(fdt, "ibm,pft-size",
> -                           pft_size_prop, sizeof(pft_size_prop))));
>          _FDT((fdt_property_string(fdt, "status", "okay")));
>          _FDT((fdt_property(fdt, "64-bit", NULL, 0)));
>  
> @@ -502,11 +509,9 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr,
>      }
>  
>      /* Advertise NUMA via ibm,associativity */
> -    if (nb_numa_nodes > 1) {
> -        ret = spapr_set_associativity(fdt, spapr);
> -        if (ret < 0) {
> -            fprintf(stderr, "Couldn't set up NUMA device tree properties\n");
> -        }
> +    ret = spapr_fixup_cpu_dt(fdt, spapr);
> +    if (ret < 0) {
> +        fprintf(stderr, "Couldn't finalize CPU device tree properties\n");
>      }
>  
>      if (!spapr_has_graphics) {
> @@ -536,12 +541,34 @@ static void emulate_spapr_hypercall(CPUPPCState *env)
>      env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]);
>  }
>  
> -static void spapr_reset(void *opaque)
> +static void spapr_reset_htab(void *opaque)
>  {
>      sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
> +    long shift;
> +
> +    /* allocate hash page table.  For now we always make this 16mb,
> +     * later we should probably make it scale to the size of guest
> +     * RAM */
> +
> +    shift = kvmppc_reset_htab(spapr->htab_shift);
> +
> +    if (shift > 0) {
> +        /* Kernel handles htab, we don't need to allocate one */
> +        spapr->htab_shift = shift;
> +    } else {
> +        if (!spapr->htab) {
> +            /* Allocate an htab if we don't yet have one */
> +            spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr));
> +        }
> +
> +        /* And clear it */
> +        memset(spapr->htab, 0, HTAB_SIZE(spapr));
> +    }
> +}
>  
> -    /* flush out the hash table */
> -    memset(spapr->htab, 0, spapr->htab_size);
> +static void spapr_reset(void *opaque)
> +{
> +    sPAPREnvironment *spapr = (sPAPREnvironment *)opaque;
>  
>      /* Load the fdt */
>      spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr,
> @@ -558,8 +585,16 @@ static void spapr_reset(void *opaque)
>  static void spapr_cpu_reset(void *opaque)
>  {
>      PowerPCCPU *cpu = opaque;
> +    CPUPPCState *env = &cpu->env;
>  
>      cpu_reset(CPU(cpu));
> +
> +    env->external_htab = spapr->htab;
> +    env->htab_base = -1;
> +    env->htab_mask = HTAB_SIZE(spapr) - 1;
> +
> +    env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
> +        (spapr->htab_shift - 18);
>  }
>  
>  static int spapr_vga_init(PCIBus *pci_bus)
> @@ -603,7 +638,6 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      uint32_t initrd_base = 0;
>      long kernel_size = 0, initrd_size = 0;
>      long load_limit, rtas_limit, fw_size;
> -    long pteg_shift = 17;
>      char *filename;
>  
>      spapr = g_malloc0(sizeof(*spapr));
> @@ -632,6 +666,11 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE;
>      load_limit = spapr->fdt_addr - FW_OVERHEAD;
>  
> +    /* For now, always aim for a 16MB hash table */
> +    /* FIXME: we should change this default based on RAM size */
> +    spapr->htab_shift = 24;
> +    qemu_register_reset(spapr_reset_htab, spapr);
> +
>      /* init CPUs */
>      if (cpu_model == NULL) {
>          cpu_model = kvm_enabled() ? "host" : "POWER7";
> @@ -664,20 +703,8 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>          memory_region_add_subregion(sysmem, nonrma_base, ram);
>      }
>  
> -    /* allocate hash page table.  For now we always make this 16mb,
> -     * later we should probably make it scale to the size of guest
> -     * RAM */
> -    spapr->htab_size = 1ULL << (pteg_shift + 7);
> -    spapr->htab = qemu_memalign(spapr->htab_size, spapr->htab_size);
> -
>      for (env = first_cpu; env != NULL; env = env->next_cpu) {
> -        env->external_htab = spapr->htab;
> -        env->htab_base = -1;
> -        env->htab_mask = spapr->htab_size - 1;
> -
>          /* Tell KVM that we're in PAPR mode */
> -        env->spr[SPR_SDR1] = (unsigned long)spapr->htab |
> -                             ((pteg_shift + 7) - 18);
>          env->spr[SPR_HIOR] = 0;
>  
>          if (kvm_enabled()) {
> @@ -816,8 +843,7 @@ static void ppc_spapr_init(ram_addr_t ram_size,
>      spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size,
>                                              initrd_base, initrd_size,
>                                              kernel_size,
> -                                            boot_device, kernel_cmdline,
> -                                            pteg_shift + 7);
> +                                            boot_device, kernel_cmdline);
>      assert(spapr->fdt_skel != NULL);
>  
>      qemu_register_reset(spapr_reset, spapr);
> diff --git a/hw/spapr.h b/hw/spapr.h
> index 9153f29..7ec4d7c 100644
> --- a/hw/spapr.h
> +++ b/hw/spapr.h
> @@ -15,7 +15,7 @@ typedef struct sPAPREnvironment {
>  
>      target_phys_addr_t ram_limit;
>      void *htab;
> -    long htab_size;
> +    long htab_shift;
>      target_phys_addr_t fdt_addr, rtas_addr;
>      long rtas_size;
>      void *fdt_skel;
> diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c
> index 829e180..12ae0d7 100644
> --- a/target-ppc/kvm.c
> +++ b/target-ppc/kvm.c
> @@ -1101,6 +1101,23 @@ int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size)
>      return 0;
>  }
>  
> +int kvmppc_reset_htab(int shift_hint)
> +{
> +    uint32_t shift = shift_hint;
> +
> +    if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) {
> +        int ret;
> +        ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift);
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        return shift;
> +    }
> +
> +    /* For now.. */
> +    return 0;
> +}
> +
>  static inline uint32_t mfpvr(void)
>  {
>      uint32_t pvr;
> diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
> index e2f8703..b5be657 100644
> --- a/target-ppc/kvm_ppc.h
> +++ b/target-ppc/kvm_ppc.h
> @@ -27,6 +27,7 @@ int kvmppc_smt_threads(void);
>  off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem);
>  void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd);
>  int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size);
> +int kvmppc_reset_htab(int shift_hint);
>  #endif /* !CONFIG_USER_ONLY */
>  const ppc_def_t *kvmppc_host_cpu_def(void);
>  int kvmppc_fixup_cpu(CPUPPCState *env);
> @@ -94,6 +95,12 @@ static inline int kvmppc_remove_spapr_tce(void *table, int pfd,
>  {
>      return -1;
>  }
> +
> +static inline int kvmppc_reset_htab(int shift_hint)
> +{
> +    return -1;
> +}
> +
>  #endif /* !CONFIG_USER_ONLY */
>  
>  static inline const ppc_def_t *kvmppc_host_cpu_def(void)
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html