Right now, we issue cpu creation from the i/o thread, and then shoot a thread from inside that code. Over the last months, a lot of subtle bugs were reported, usually arising from the very fragile order of that initialization. I propose we rethink that a little. This is a patch that received basic testing only, and I'd like to hear opinions on the overall direction. The idea is to issue the new thread as early as possible. The first direct benefits I can identify are that we no longer have to rely on on_vcpu-like schemes for issuing vcpu ioctls, since we are already on the right thread. Apic creation has far fewer spots for race conditions as well. I am implementing this on qemu-kvm first, since we can show the benefits of it a bit better in there (since we already support smp) Let me know what you guys think Signed-off-by: Glauber Costa <glommer@xxxxxxxxxx> CC: Marcelo Tosatti <mtosatti@xxxxxxxxxx> CC: Avi Kivity <avi@xxxxxxxxxx> CC: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> CC: Anthony Liguori <aliguori@xxxxxxxxxx> --- cpu-defs.h | 2 +- hw/acpi.c | 2 +- hw/pc.c | 26 ++++++++++++-------------- hw/pc.h | 2 +- qemu-kvm-x86.c | 5 +++++ qemu-kvm.c | 44 +++++++++++++++++++++++++++++++------------- qemu-kvm.h | 2 ++ vl.c | 2 -- 8 files changed, 53 insertions(+), 32 deletions(-) diff --git a/cpu-defs.h b/cpu-defs.h index cf502e9..6d026e0 100644 --- a/cpu-defs.h +++ b/cpu-defs.h @@ -139,7 +139,7 @@ typedef struct CPUWatchpoint { struct qemu_work_item; struct KVMCPUState { - pthread_t thread; + pthread_t *thread; int signalled; struct qemu_work_item *queued_work_first, *queued_work_last; int regs_modified; diff --git a/hw/acpi.c b/hw/acpi.c index 7564abf..cc68188 100644 --- a/hw/acpi.c +++ b/hw/acpi.c @@ -781,7 +781,7 @@ void qemu_system_cpu_hot_add(int cpu, int state) CPUState *env; if (state && !qemu_get_cpu(cpu)) { - env = pc_new_cpu(model); + pc_new_cpu(model, &env); if (!env) { fprintf(stderr, "cpu %d creation failed\n", cpu); return; diff --git a/hw/pc.c b/hw/pc.c index 
83012a9..53e7273 100644 --- a/hw/pc.c +++ b/hw/pc.c @@ -1013,29 +1013,26 @@ int cpu_is_bsp(CPUState *env) return env->cpuid_apic_id == 0; } -CPUState *pc_new_cpu(const char *cpu_model) +void pc_new_cpu(const char *cpu_model, CPUState **env) { - CPUState *env; - - env = cpu_init(cpu_model); - if (!env) { + *env = cpu_init(cpu_model); + if (!*env) { fprintf(stderr, "Unable to find x86 CPU definition\n"); exit(1); } - env->kvm_cpu_state.regs_modified = 1; - if ((env->cpuid_features & CPUID_APIC) || smp_cpus > 1) { - env->cpuid_apic_id = env->cpu_index; + (*env)->kvm_cpu_state.regs_modified = 1; + if (((*env)->cpuid_features & CPUID_APIC) || smp_cpus > 1) { + (*env)->cpuid_apic_id = (*env)->cpu_index; /* APIC reset callback resets cpu */ - apic_init(env); + apic_init(*env); } else { - qemu_register_reset((QEMUResetHandler*)cpu_reset, env); + qemu_register_reset((QEMUResetHandler*)cpu_reset, *env); } /* kvm needs this to run after the apic is initialized. Otherwise, * it can access invalid state and crash. 
*/ - qemu_init_vcpu(env); - return env; + qemu_init_vcpu(*env); } /* PC hardware initialisation */ @@ -1055,7 +1052,6 @@ static void pc_init1(ram_addr_t ram_size, PCIBus *pci_bus; ISADevice *isa_dev; int piix3_devfn = -1; - CPUState *env; qemu_irq *cpu_irq; qemu_irq *isa_irq; qemu_irq *i8259; @@ -1086,8 +1082,10 @@ static void pc_init1(ram_addr_t ram_size, if (kvm_enabled()) { kvm_set_boot_cpu_id(0); } + for (i = 0; i < smp_cpus; i++) { - env = pc_new_cpu(cpu_model); +// pc_new_cpu(cpu_model, &env); + kvm_init_vcpu(NULL); } vmport_init(); diff --git a/hw/pc.h b/hw/pc.h index 93eb34d..f931380 100644 --- a/hw/pc.h +++ b/hw/pc.h @@ -100,7 +100,7 @@ extern int fd_bootchk; void ioport_set_a20(int enable); int ioport_get_a20(void); -CPUState *pc_new_cpu(const char *cpu_model); +void pc_new_cpu(const char *cpu_model, CPUState **env); /* acpi.c */ extern int acpi_enabled; diff --git a/qemu-kvm-x86.c b/qemu-kvm-x86.c index 0d263ca..4084312 100644 --- a/qemu-kvm-x86.c +++ b/qemu-kvm-x86.c @@ -160,6 +160,11 @@ int kvm_arch_create(kvm_context_t kvm, unsigned long phys_mem_bytes, return 0; } +void kvm_arch_create_vcpu(const char *model, CPUState **env) +{ + pc_new_cpu("qemu64", env); +} + #ifdef KVM_EXIT_TPR_ACCESS static int kvm_handle_tpr_access(CPUState *env) diff --git a/qemu-kvm.c b/qemu-kvm.c index b58a457..f83d19a 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -436,10 +436,18 @@ void kvm_disable_pit_creation(kvm_context_t kvm) kvm->no_pit_creation = 1; } -static void kvm_create_vcpu(CPUState *env, int id) + +static void kvm_create_vcpu(CPUState **_env) { long mmap_size; int r; + int id; + CPUState *env; + + kvm_arch_create_vcpu("qemu64", _env); + env = *(CPUState **)_env; + + id = env->cpu_index; r = kvm_vm_ioctl(kvm_state, KVM_CREATE_VCPU, id); if (r < 0) { @@ -1549,11 +1557,13 @@ static void sigbus_handler(int n, struct qemu_signalfd_siginfo *siginfo, static void on_vcpu(CPUState *env, void (*func)(void *data), void *data) { struct qemu_work_item wi; + static int remotes; 
if (env == current_env) { func(data); return; } + printf("remotes: %d\n", remotes++); wi.func = func; wi.data = data; @@ -1565,7 +1575,7 @@ static void on_vcpu(CPUState *env, void (*func)(void *data), void *data) wi.next = NULL; wi.done = false; - pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); + pthread_kill(*env->kvm_cpu_state.thread, SIG_IPI); while (!wi.done) qemu_cond_wait(&qemu_work_cond); } @@ -1616,8 +1626,8 @@ void kvm_update_interrupt_request(CPUState *env) if (signal) { env->kvm_cpu_state.signalled = 1; - if (env->kvm_cpu_state.thread) - pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); + if (*env->kvm_cpu_state.thread) + pthread_kill(*env->kvm_cpu_state.thread, SIG_IPI); } } } @@ -1840,7 +1850,7 @@ static void pause_all_threads(void) while (penv) { if (penv != cpu_single_env) { penv->stop = 1; - pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI); + pthread_kill(*penv->kvm_cpu_state.thread, SIG_IPI); } else { penv->stop = 0; penv->stopped = 1; @@ -1862,7 +1872,7 @@ static void resume_all_threads(void) while (penv) { penv->stop = 0; penv->stopped = 0; - pthread_kill(penv->kvm_cpu_state.thread, SIG_IPI); + pthread_kill(*penv->kvm_cpu_state.thread, SIG_IPI); penv = (CPUState *) penv->next_cpu; } } @@ -1934,19 +1944,23 @@ static int kvm_main_loop_cpu(CPUState *env) return 0; } +extern void pc_new_cpu(const char *c, CPUState **env); + static void *ap_main_loop(void *_env) { - CPUState *env = _env; + CPUState *env; sigset_t signals; #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT struct ioperm_data *data = NULL; #endif - current_env = env; - env->thread_id = kvm_get_thread_id(); sigfillset(&signals); sigprocmask(SIG_BLOCK, &signals, NULL); - kvm_create_vcpu(env, env->cpu_index); + kvm_create_vcpu(&env); + + *(CPUState **)_env = env; + current_env = env; + env->thread_id = kvm_get_thread_id(); #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT /* do ioperm for io ports of assigned devices */ @@ -1981,12 +1995,16 @@ static void *ap_main_loop(void *_env) return NULL; } -void 
kvm_init_vcpu(CPUState *env) +void kvm_init_vcpu(CPUState *_env) { - pthread_create(&env->kvm_cpu_state.thread, NULL, ap_main_loop, env); + CPUState *env = NULL; + pthread_t *thread = qemu_malloc(sizeof(*thread)); + + pthread_create(thread, NULL, ap_main_loop, &env); - while (env->created == 0) + while (env == NULL) qemu_cond_wait(&qemu_vcpu_cond); + env->kvm_cpu_state.thread = thread; } int kvm_vcpu_inited(CPUState *env) diff --git a/qemu-kvm.h b/qemu-kvm.h index 525d78d..4dd50f8 100644 --- a/qemu-kvm.h +++ b/qemu-kvm.h @@ -87,6 +87,8 @@ int kvm_alloc_userspace_memory(kvm_context_t kvm, unsigned long memory, int kvm_arch_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem); +void kvm_arch_create_vcpu(const char *model, CPUState **env); + int kvm_arch_run(CPUState *env); diff --git a/vl.c b/vl.c index 5dc7b2b..8dfae43 100644 --- a/vl.c +++ b/vl.c @@ -3570,8 +3570,6 @@ void qemu_init_vcpu(void *_env) { CPUState *env = _env; - if (kvm_enabled()) - kvm_init_vcpu(env); env->nr_cores = smp_cores; env->nr_threads = smp_threads; return; -- 1.6.2.5 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html