This patch originates in the mp_state writeback issue: During runtime and even on reset, we must not write the previously saved VCPU state back into the kernel in an uncontrolled fashion. E.g mp_state should only written on reset or on VCPU setup. Certain clocks (e.g. the TSC) may only be written on setup or after vmload. By introducing additional information about the context of the planned vcpu state manipulation, we can simply skip sensitive states like mp_state when updating the in-kernel state. The planned modifications are defined when calling cpu_synchronize_state. They accumulate, ie. once a full writeback was requested, it will stick until it was performed. This patch already fixes existing writeback issues in upstream KVM by only selectively writing MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, the mp_state and the vcpu_events. Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> --- This patch is intentionally written against uq/master. As upstream is less convoluted (yet :) ), it may help understanding the basic idea. An add-on patch for qemu-kvm that both cleans up the current code and also moves kvm_get/set_lapic into kvm_arch_get/put_registers (hmm, maybe also renaming that services...) will follow soon if no one sees fundamental problems of this approach. exec.c | 4 ++-- gdbstub.c | 10 +++++----- hw/apic.c | 2 +- hw/ppc_newworld.c | 2 +- hw/ppc_oldworld.c | 2 +- hw/s390-virtio.c | 2 +- kvm-all.c | 31 +++++++++++++++++++------------ kvm.h | 13 +++++++++---- monitor.c | 4 ++-- target-i386/helper.c | 2 +- target-i386/kvm.c | 31 +++++++++++++++++++------------ target-i386/machine.c | 4 ++-- target-ppc/kvm.c | 2 +- target-ppc/machine.c | 4 ++-- target-s390x/kvm.c | 17 ++++++++++++----- 15 files changed, 78 insertions(+), 52 deletions(-) diff --git a/exec.c b/exec.c index 6875370..5a83922 100644 --- a/exec.c +++ b/exec.c @@ -521,14 +521,14 @@ static void cpu_common_pre_save(void *opaque) { CPUState *env = opaque; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_NONE); } static int cpu_common_pre_load(void *opaque) { CPUState *env = opaque; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_INIT); return 0; } diff --git a/gdbstub.c b/gdbstub.c index 80477be..8caae15 100644 --- a/gdbstub.c +++ b/gdbstub.c @@ -1653,7 +1653,7 @@ static void gdb_breakpoint_remove_all(void) static void gdb_set_cpu_pc(GDBState *s, target_ulong pc) { #if defined(TARGET_I386) - cpu_synchronize_state(s->c_cpu); + cpu_synchronize_state(s->c_cpu, CPU_MODIFY_RUNTIME); s->c_cpu->eip = pc; #elif defined (TARGET_PPC) s->c_cpu->nip = pc; @@ -1678,7 +1678,7 @@ static void gdb_set_cpu_pc(GDBState *s, target_ulong pc) #elif defined (TARGET_ALPHA) s->c_cpu->pc = pc; #elif defined (TARGET_S390X) - cpu_synchronize_state(s->c_cpu); + cpu_synchronize_state(s->c_cpu, CPU_MODIFY_RUNTIME); s->c_cpu->psw.addr = pc; #endif } @@ -1848,7 +1848,7 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf) } break; case 'g': - cpu_synchronize_state(s->g_cpu); + cpu_synchronize_state(s->g_cpu, CPU_MODIFY_NONE); len = 0; for (addr = 0; addr < num_g_regs; addr++) { reg_size = gdb_read_register(s->g_cpu, mem_buf + len, addr); @@ -1858,7 +1858,7 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf) put_packet(s, buf); break; case 'G': - cpu_synchronize_state(s->g_cpu); + cpu_synchronize_state(s->g_cpu, CPU_MODIFY_RUNTIME); registers = mem_buf; len = strlen(p) / 2; hextomem((uint8_t *)registers, p, len); @@ -2022,7 +2022,7 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf) thread = strtoull(p+16, (char **)&p, 16); env = find_cpu(thread); if (env != NULL) { - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_NONE); len = snprintf((char *)mem_buf, sizeof(mem_buf), "CPU#%d [%s]", env->cpu_index, env->halted ? "halted " : "running"); diff --git a/hw/apic.c b/hw/apic.c index 87e7dc0..3562ad5 100644 --- a/hw/apic.c +++ b/hw/apic.c @@ -938,7 +938,7 @@ static void apic_reset(void *opaque) APICState *s = opaque; int bsp; - cpu_synchronize_state(s->cpu_env); + cpu_synchronize_state(s->cpu_env, CPU_MODIFY_RESET); bsp = cpu_is_bsp(s->cpu_env); s->apicbase = 0xfee00000 | diff --git a/hw/ppc_newworld.c b/hw/ppc_newworld.c index a4c714a..6e33c47 100644 --- a/hw/ppc_newworld.c +++ b/hw/ppc_newworld.c @@ -140,7 +140,7 @@ static void ppc_core99_init (ram_addr_t ram_size, } /* Make sure all register sets take effect */ - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_INIT); /* allocate RAM */ ram_offset = qemu_ram_alloc(ram_size); diff --git a/hw/ppc_oldworld.c b/hw/ppc_oldworld.c index 7ccc6a1..ba2b6c3 100644 --- a/hw/ppc_oldworld.c +++ b/hw/ppc_oldworld.c @@ -165,7 +165,7 @@ static void ppc_heathrow_init (ram_addr_t ram_size, } /* Make sure all register sets take effect */ - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_INIT); /* allocate RAM */ if (ram_size > (2047 << 20)) { diff --git a/hw/s390-virtio.c b/hw/s390-virtio.c index 3582728..d7577dd 100644 --- a/hw/s390-virtio.c +++ b/hw/s390-virtio.c @@ -185,7 +185,7 @@ static void s390_init(ram_addr_t ram_size, exit(1); } - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_INIT); env->psw.addr = KERN_IMAGE_START; env->psw.mask = 0x0000000180000000ULL; } diff --git a/kvm-all.c b/kvm-all.c index f8350c9..8595cd9 100644 --- a/kvm-all.c +++ b/kvm-all.c @@ -57,7 +57,8 @@ struct KVMState KVMSlot slots[32]; int fd; int vmfd; - int regs_modified; + int synchronized; + int pending_modifications; int coalesced_mmio; #ifdef KVM_CAP_COALESCED_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; @@ -155,10 +156,12 @@ static void kvm_reset_vcpu(void *opaque) CPUState *env = opaque; kvm_arch_reset_vcpu(env); - if (kvm_arch_put_registers(env)) { + if (kvm_arch_put_registers(env, env->kvm_state->pending_modifications + | CPU_MODIFY_RESET)) { fprintf(stderr, "Fatal: kvm vcpu reset failed\n"); abort(); } + env->kvm_state->pending_modifications = CPU_MODIFY_NONE; } int kvm_irqchip_in_kernel(void) @@ -213,7 +216,8 @@ int kvm_init_vcpu(CPUState *env) if (ret == 0) { qemu_register_reset(kvm_reset_vcpu, env); kvm_arch_reset_vcpu(env); - ret = kvm_arch_put_registers(env); + ret = kvm_arch_put_registers(env, CPU_MODIFY_INIT); + env->kvm_state->pending_modifications = CPU_MODIFY_NONE; } err: return ret; @@ -572,12 +576,13 @@ void kvm_flush_coalesced_mmio_buffer(void) #endif } -void kvm_cpu_synchronize_state(CPUState *env) +void kvm_cpu_synchronize_state(CPUState *env, int modifications) { - if (!env->kvm_state->regs_modified) { + if (!env->kvm_state->synchronized) { kvm_arch_get_registers(env); - env->kvm_state->regs_modified = 1; + env->kvm_state->synchronized = 1; } + env->kvm_state->pending_modifications |= modifications; } int kvm_cpu_exec(CPUState *env) @@ -594,9 +599,9 @@ int kvm_cpu_exec(CPUState *env) break; } - if (env->kvm_state->regs_modified) { - kvm_arch_put_registers(env); - env->kvm_state->regs_modified = 0; + if (env->kvm_state->pending_modifications) { + kvm_arch_put_registers(env, env->kvm_state->pending_modifications); + env->kvm_state->pending_modifications = CPU_MODIFY_NONE; } kvm_arch_pre_run(env, run); @@ -605,6 +610,8 @@ int kvm_cpu_exec(CPUState *env) qemu_mutex_lock_iothread(); kvm_arch_post_run(env, run); + env->kvm_state->synchronized = 0; + if (ret == -EINTR || ret == -EAGAIN) { dprintf("io window exit\n"); ret = 0; @@ -946,9 +953,9 @@ static void kvm_invoke_set_guest_debug(void *data) struct kvm_set_guest_debug_data *dbg_data = data; CPUState *env = dbg_data->env; - if (env->kvm_state->regs_modified) { - kvm_arch_put_registers(env); - env->kvm_state->regs_modified = 0; + if (env->kvm_state->pending_modifications) { + kvm_arch_put_registers(env, env->kvm_state->pending_modifications); + env->kvm_state->pending_modifications = CPU_MODIFY_NONE; } dbg_data->err = kvm_vcpu_ioctl(env, KVM_SET_GUEST_DEBUG, &dbg_data->dbg); } diff --git a/kvm.h b/kvm.h index 59cba18..38d0285 100644 --- a/kvm.h +++ b/kvm.h @@ -86,7 +86,7 @@ int kvm_arch_pre_run(CPUState *env, struct kvm_run *run); int kvm_arch_get_registers(CPUState *env); -int kvm_arch_put_registers(CPUState *env); +int kvm_arch_put_registers(CPUState *env, int modifications); int kvm_arch_init(KVMState *s, int smp_cpus); @@ -129,14 +129,19 @@ int kvm_check_extension(KVMState *s, unsigned int extension); uint32_t kvm_arch_get_supported_cpuid(CPUState *env, uint32_t function, int reg); -void kvm_cpu_synchronize_state(CPUState *env); +void kvm_cpu_synchronize_state(CPUState *env, int modifications); /* generic hooks - to be moved/refactored once there are more users */ -static inline void cpu_synchronize_state(CPUState *env) +#define CPU_MODIFY_NONE 0 +#define CPU_MODIFY_RUNTIME (1 << 0) +#define CPU_MODIFY_RESET ((1 << 1) | CPU_MODIFY_RUNTIME) +#define CPU_MODIFY_INIT ((1 << 2) | CPU_MODIFY_RESET) + +static inline void cpu_synchronize_state(CPUState *env, int modifications) { if (kvm_enabled()) { - kvm_cpu_synchronize_state(env); + kvm_cpu_synchronize_state(env, modifications); } } diff --git a/monitor.c b/monitor.c index cadf422..833191c 100644 --- a/monitor.c +++ b/monitor.c @@ -684,7 +684,7 @@ static CPUState *mon_get_cpu(void) if (!cur_mon->mon_cpu) { mon_set_cpu(0); } - cpu_synchronize_state(cur_mon->mon_cpu); + cpu_synchronize_state(cur_mon->mon_cpu, CPU_MODIFY_NONE); return cur_mon->mon_cpu; } @@ -783,7 +783,7 @@ static void do_info_cpus(Monitor *mon, QObject **ret_data) QDict *cpu; QObject *obj; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_NONE); obj = qobject_from_jsonf("{ 'CPU': %d, 'current': %i, 'halted': %i }", env->cpu_index, env == mon->mon_cpu, diff --git a/target-i386/helper.c b/target-i386/helper.c index 70762bb..31b78ac 100644 --- a/target-i386/helper.c +++ b/target-i386/helper.c @@ -766,7 +766,7 @@ void cpu_dump_state(CPUState *env, FILE *f, char cc_op_name[32]; static const char *seg_name[6] = { "ES", "CS", "SS", "DS", "FS", "GS" }; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_NONE); eflags = env->eflags; #ifdef TARGET_X86_64 diff --git a/target-i386/kvm.c b/target-i386/kvm.c index 5b093ce..a096b30 100644 --- a/target-i386/kvm.c +++ b/target-i386/kvm.c @@ -534,7 +534,7 @@ static void kvm_msr_entry_set(struct kvm_msr_entry *entry, entry->data = value; } -static int kvm_put_msrs(CPUState *env) +static int kvm_put_msrs(CPUState *env, int modifications) { struct { struct kvm_msrs info; @@ -548,7 +548,9 @@ static int kvm_put_msrs(CPUState *env) kvm_msr_entry_set(&msrs[n++], MSR_IA32_SYSENTER_EIP, env->sysenter_eip); if (kvm_has_msr_star(env)) kvm_msr_entry_set(&msrs[n++], MSR_STAR, env->star); - kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc); + if (modifications & CPU_MODIFY_INIT) { + kvm_msr_entry_set(&msrs[n++], MSR_IA32_TSC, env->tsc); + } #ifdef TARGET_X86_64 /* FIXME if lm capable */ kvm_msr_entry_set(&msrs[n++], MSR_CSTAR, env->cstar); @@ -556,8 +558,11 @@ static int kvm_put_msrs(CPUState *env) kvm_msr_entry_set(&msrs[n++], MSR_FMASK, env->fmask); kvm_msr_entry_set(&msrs[n++], MSR_LSTAR, env->lstar); #endif - kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME, env->system_time_msr); - kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr); + if (modifications & CPU_MODIFY_INIT) { + kvm_msr_entry_set(&msrs[n++], MSR_KVM_SYSTEM_TIME, + env->system_time_msr); + kvm_msr_entry_set(&msrs[n++], MSR_KVM_WALL_CLOCK, env->wall_clock_msr); + } msr_data.info.nmsrs = n; @@ -837,7 +842,7 @@ static int kvm_get_vcpu_events(CPUState *env) return 0; } -int kvm_arch_put_registers(CPUState *env) +int kvm_arch_put_registers(CPUState *env, int modifications) { int ret; @@ -853,17 +858,19 @@ int kvm_arch_put_registers(CPUState *env) if (ret < 0) return ret; - ret = kvm_put_msrs(env); + ret = kvm_put_msrs(env, modifications); if (ret < 0) return ret; - ret = kvm_put_mp_state(env); - if (ret < 0) - return ret; + if (modifications & CPU_MODIFY_RESET) { + ret = kvm_put_mp_state(env); + if (ret < 0) + return ret; - ret = kvm_put_vcpu_events(env); - if (ret < 0) - return ret; + ret = kvm_put_vcpu_events(env); + if (ret < 0) + return ret; + } return 0; } diff --git a/target-i386/machine.c b/target-i386/machine.c index 8770491..27d0b77 100644 --- a/target-i386/machine.c +++ b/target-i386/machine.c @@ -321,7 +321,7 @@ static void cpu_pre_save(void *opaque) CPUState *env = opaque; int i; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_NONE); /* FPU */ env->fpus_vmstate = (env->fpus & ~0x3800) | (env->fpstt & 0x7) << 11; @@ -341,7 +341,7 @@ static int cpu_pre_load(void *opaque) { CPUState *env = opaque; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_INIT); return 0; } diff --git a/target-ppc/kvm.c b/target-ppc/kvm.c index 0424a78..53aaf02 100644 --- a/target-ppc/kvm.c +++ b/target-ppc/kvm.c @@ -57,7 +57,7 @@ void kvm_arch_reset_vcpu(CPUState *env) { } -int kvm_arch_put_registers(CPUState *env) +int kvm_arch_put_registers(CPUState *env, int modifications) { struct kvm_regs regs; int ret; diff --git a/target-ppc/machine.c b/target-ppc/machine.c index 4897c8a..a0f24fb 100644 --- a/target-ppc/machine.c +++ b/target-ppc/machine.c @@ -7,7 +7,7 @@ void cpu_save(QEMUFile *f, void *opaque) CPUState *env = (CPUState *)opaque; unsigned int i, j; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_NONE); for (i = 0; i < 32; i++) qemu_put_betls(f, &env->gpr[i]); @@ -96,7 +96,7 @@ int cpu_load(QEMUFile *f, void *opaque, int version_id) CPUState *env = (CPUState *)opaque; unsigned int i, j; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_INIT); for (i = 0; i < 32; i++) qemu_get_betls(f, &env->gpr[i]); diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c index 0992563..f737b1c 100644 --- a/target-s390x/kvm.c +++ b/target-s390x/kvm.c @@ -91,7 +91,7 @@ void kvm_arch_reset_vcpu(CPUState *env) /* FIXME: add code to reset vcpu. */ } -int kvm_arch_put_registers(CPUState *env) +int kvm_arch_put_registers(CPUState *env, int modifications) { struct kvm_regs regs; int ret; @@ -235,7 +235,7 @@ static int sclp_service_call(CPUState *env, struct kvm_run *run, uint16_t ipbh0) uint64_t code; int r = 0; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_RUNTIME); sccb = env->regs[ipbh0 & 0xf]; code = env->regs[(ipbh0 & 0xf0) >> 4]; @@ -294,9 +294,16 @@ static int handle_hypercall(CPUState *env, struct kvm_run *run) { int r; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_RUNTIME); r = s390_virtio_hypercall(env); - kvm_arch_put_registers(env); + + /* + * FIXME: The following two lines look misplaced. They should be redundant + * because of automatic writeback on guest resume. + */ + kvm_arch_put_registers(env, env->kvm_state->pending_modifications + | CPU_MODIFY_RUNTIME); + env->kvm_state->pending_modifications = CPU_MODIFY_NONE; return r; } @@ -354,7 +361,7 @@ static int handle_sigp(CPUState *env, struct kvm_run *run, uint8_t ipa1) int r = -1; CPUState *target_env; - cpu_synchronize_state(env); + cpu_synchronize_state(env, CPU_MODIFY_RUNTIME); /* get order code */ order_code = run->s390_sieic.ipb >> 28; -- 1.6.0.2 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html