On Wed, 2012-06-27 at 22:10 +1000, Benjamin Herrenschmidt wrote: > From: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> > > This adds support for the new "reset htab" ioctl which allows qemu > to properly clean up the MMU hash table when the guest is reset. With > the corresponding kernel support, reset of a guest now works properly. Forgot to mention ... this depends on a newer linux kvm.h from Avi's -next branch, so don't apply this patch to qemu until kvm.h has had the update adding the definitions for KVM_CAP_PPC_ALLOC_HTAB and KVM_PPC_ALLOCATE_HTAB. Cheers, Ben. > This also paves the way for indicating a different size hash table > to the kernel and for the kernel to be able to impose limits on > the requested size. > > Signed-off-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> > Signed-off-by: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx> > --- > hw/spapr.c | 88 ++++++++++++++++++++++++++++++++------------------ > hw/spapr.h | 2 +- > target-ppc/kvm.c | 17 ++++++++++ > target-ppc/kvm_ppc.h | 7 ++++ > 4 files changed, 82 insertions(+), 32 deletions(-) > > diff --git a/hw/spapr.c b/hw/spapr.c > index a6bc5e8..e19dbd8 100644 > --- a/hw/spapr.c > +++ b/hw/spapr.c > @@ -83,6 +83,8 @@ > > #define PHANDLE_XICP 0x00001111 > > +#define HTAB_SIZE(spapr) (1ULL << ((spapr)->htab_shift)) > + > sPAPREnvironment *spapr; > static int spapr_has_graphics; > > @@ -111,12 +113,13 @@ qemu_irq spapr_allocate_irq(uint32_t hint, uint32_t *irq_num, > return qirq; > } > > -static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr) > +static int spapr_fixup_cpu_dt(void *fdt, sPAPREnvironment *spapr) > { > int ret = 0, offset; > CPUPPCState *env; > char cpu_model[32]; > int smt = kvmppc_smt_threads(); > + uint32_t pft_size_prop[] = {0, cpu_to_be32(spapr->htab_shift)}; > > assert(spapr->cpu_model); > > @@ -140,8 +143,16 @@ static int spapr_set_associativity(void *fdt, sPAPREnvironment *spapr) > return offset; > } > > - ret = fdt_setprop(fdt, offset, "ibm,associativity", 
associativity, > - sizeof(associativity)); > + if (nb_numa_nodes > 1) { > + ret = fdt_setprop(fdt, offset, "ibm,associativity", associativity, > + sizeof(associativity)); > + if (ret < 0) { > + return ret; > + } > + } > + > + ret = fdt_setprop(fdt, offset, "ibm,pft-size", > + pft_size_prop, sizeof(pft_size_prop)); > if (ret < 0) { > return ret; > } > @@ -189,15 +200,13 @@ static void *spapr_create_fdt_skel(const char *cpu_model, > target_phys_addr_t initrd_size, > target_phys_addr_t kernel_size, > const char *boot_device, > - const char *kernel_cmdline, > - long hash_shift) > + const char *kernel_cmdline) > { > void *fdt; > CPUPPCState *env; > uint64_t mem_reg_property[2]; > uint32_t start_prop = cpu_to_be32(initrd_base); > uint32_t end_prop = cpu_to_be32(initrd_base + initrd_size); > - uint32_t pft_size_prop[] = {0, cpu_to_be32(hash_shift)}; > char hypertas_prop[] = "hcall-pft\0hcall-term\0hcall-dabr\0hcall-interrupt" > "\0hcall-tce\0hcall-vio\0hcall-splpar\0hcall-bulk"; > char qemu_hypertas_prop[] = "hcall-memop1"; > @@ -366,8 +375,6 @@ static void *spapr_create_fdt_skel(const char *cpu_model, > _FDT((fdt_property_cell(fdt, "timebase-frequency", tbfreq))); > _FDT((fdt_property_cell(fdt, "clock-frequency", cpufreq))); > _FDT((fdt_property_cell(fdt, "ibm,slb-size", env->slb_nr))); > - _FDT((fdt_property(fdt, "ibm,pft-size", > - pft_size_prop, sizeof(pft_size_prop)))); > _FDT((fdt_property_string(fdt, "status", "okay"))); > _FDT((fdt_property(fdt, "64-bit", NULL, 0))); > > @@ -502,11 +509,9 @@ static void spapr_finalize_fdt(sPAPREnvironment *spapr, > } > > /* Advertise NUMA via ibm,associativity */ > - if (nb_numa_nodes > 1) { > - ret = spapr_set_associativity(fdt, spapr); > - if (ret < 0) { > - fprintf(stderr, "Couldn't set up NUMA device tree properties\n"); > - } > + ret = spapr_fixup_cpu_dt(fdt, spapr); > + if (ret < 0) { > + fprintf(stderr, "Couldn't finalize CPU device tree properties\n"); > } > > if (!spapr_has_graphics) { > @@ -536,12 +541,34 @@ static void 
emulate_spapr_hypercall(CPUPPCState *env) > env->gpr[3] = spapr_hypercall(env, env->gpr[3], &env->gpr[4]); > } > > -static void spapr_reset(void *opaque) > +static void spapr_reset_htab(void *opaque) > { > sPAPREnvironment *spapr = (sPAPREnvironment *)opaque; > + long shift; > + > + /* allocate hash page table. For now we always make this 16mb, > + * later we should probably make it scale to the size of guest > + * RAM */ > + > + shift = kvmppc_reset_htab(spapr->htab_shift); > + > + if (shift > 0) { > + /* Kernel handles htab, we don't need to allocate one */ > + spapr->htab_shift = shift; > + } else { > + if (!spapr->htab) { > + /* Allocate an htab if we don't yet have one */ > + spapr->htab = qemu_memalign(HTAB_SIZE(spapr), HTAB_SIZE(spapr)); > + } > + > + /* And clear it */ > + memset(spapr->htab, 0, HTAB_SIZE(spapr)); > + } > +} > > - /* flush out the hash table */ > - memset(spapr->htab, 0, spapr->htab_size); > +static void spapr_reset(void *opaque) > +{ > + sPAPREnvironment *spapr = (sPAPREnvironment *)opaque; > > /* Load the fdt */ > spapr_finalize_fdt(spapr, spapr->fdt_addr, spapr->rtas_addr, > @@ -558,8 +585,16 @@ static void spapr_reset(void *opaque) > static void spapr_cpu_reset(void *opaque) > { > PowerPCCPU *cpu = opaque; > + CPUPPCState *env = &cpu->env; > > cpu_reset(CPU(cpu)); > + > + env->external_htab = spapr->htab; > + env->htab_base = -1; > + env->htab_mask = HTAB_SIZE(spapr) - 1; > + > + env->spr[SPR_SDR1] = (unsigned long)spapr->htab | > + (spapr->htab_shift - 18); > } > > static int spapr_vga_init(PCIBus *pci_bus) > @@ -603,7 +638,6 @@ static void ppc_spapr_init(ram_addr_t ram_size, > uint32_t initrd_base = 0; > long kernel_size = 0, initrd_size = 0; > long load_limit, rtas_limit, fw_size; > - long pteg_shift = 17; > char *filename; > > spapr = g_malloc0(sizeof(*spapr)); > @@ -632,6 +666,11 @@ static void ppc_spapr_init(ram_addr_t ram_size, > spapr->fdt_addr = spapr->rtas_addr - FDT_MAX_SIZE; > load_limit = spapr->fdt_addr - FW_OVERHEAD; > > 
+ /* For now, always aim for a 16MB hash table */ > + /* FIXME: we should change this default based on RAM size */ > + spapr->htab_shift = 24; > + qemu_register_reset(spapr_reset_htab, spapr); > + > /* init CPUs */ > if (cpu_model == NULL) { > cpu_model = kvm_enabled() ? "host" : "POWER7"; > @@ -664,20 +703,8 @@ static void ppc_spapr_init(ram_addr_t ram_size, > memory_region_add_subregion(sysmem, nonrma_base, ram); > } > > - /* allocate hash page table. For now we always make this 16mb, > - * later we should probably make it scale to the size of guest > - * RAM */ > - spapr->htab_size = 1ULL << (pteg_shift + 7); > - spapr->htab = qemu_memalign(spapr->htab_size, spapr->htab_size); > - > for (env = first_cpu; env != NULL; env = env->next_cpu) { > - env->external_htab = spapr->htab; > - env->htab_base = -1; > - env->htab_mask = spapr->htab_size - 1; > - > /* Tell KVM that we're in PAPR mode */ > - env->spr[SPR_SDR1] = (unsigned long)spapr->htab | > - ((pteg_shift + 7) - 18); > env->spr[SPR_HIOR] = 0; > > if (kvm_enabled()) { > @@ -816,8 +843,7 @@ static void ppc_spapr_init(ram_addr_t ram_size, > spapr->fdt_skel = spapr_create_fdt_skel(cpu_model, rma_size, > initrd_base, initrd_size, > kernel_size, > - boot_device, kernel_cmdline, > - pteg_shift + 7); > + boot_device, kernel_cmdline); > assert(spapr->fdt_skel != NULL); > > qemu_register_reset(spapr_reset, spapr); > diff --git a/hw/spapr.h b/hw/spapr.h > index 9153f29..7ec4d7c 100644 > --- a/hw/spapr.h > +++ b/hw/spapr.h > @@ -15,7 +15,7 @@ typedef struct sPAPREnvironment { > > target_phys_addr_t ram_limit; > void *htab; > - long htab_size; > + long htab_shift; > target_phys_addr_t fdt_addr, rtas_addr; > long rtas_size; > void *fdt_skel; > diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c > index 829e180..12ae0d7 100644 > --- a/target-ppc/kvm.c > +++ b/target-ppc/kvm.c > @@ -1101,6 +1101,23 @@ int kvmppc_remove_spapr_tce(void *table, int fd, uint32_t window_size) > return 0; > } > > +int kvmppc_reset_htab(int shift_hint) 
> +{ > + uint32_t shift = shift_hint; > + > + if (kvm_check_extension(kvm_state, KVM_CAP_PPC_ALLOC_HTAB)) { > + int ret; > + ret = kvm_vm_ioctl(kvm_state, KVM_PPC_ALLOCATE_HTAB, &shift); > + if (ret < 0) { > + return ret; > + } > + return shift; > + } > + > + /* For now.. */ > + return 0; > +} > + > static inline uint32_t mfpvr(void) > { > uint32_t pvr; > diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h > index e2f8703..b5be657 100644 > --- a/target-ppc/kvm_ppc.h > +++ b/target-ppc/kvm_ppc.h > @@ -27,6 +27,7 @@ int kvmppc_smt_threads(void); > off_t kvmppc_alloc_rma(const char *name, MemoryRegion *sysmem); > void *kvmppc_create_spapr_tce(uint32_t liobn, uint32_t window_size, int *pfd); > int kvmppc_remove_spapr_tce(void *table, int pfd, uint32_t window_size); > +int kvmppc_reset_htab(int shift_hint); > #endif /* !CONFIG_USER_ONLY */ > const ppc_def_t *kvmppc_host_cpu_def(void); > int kvmppc_fixup_cpu(CPUPPCState *env); > @@ -94,6 +95,12 @@ static inline int kvmppc_remove_spapr_tce(void *table, int pfd, > { > return -1; > } > + > +static inline int kvmppc_reset_htab(int shift_hint) > +{ > + return -1; > +} > + > #endif /* !CONFIG_USER_ONLY */ > > static inline const ppc_def_t *kvmppc_host_cpu_def(void) > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html