qemu-kvm vcpu threads don't respond to SIGSTOP/SIGCONT. Instead of teaching them to respond to these signals (which cannot be trapped), use SIGUSR1 to approximate the behavior of SIGSTOP/SIGCONT. The purpose of this is to implement CPU hard limits using an external tool that watches the CPU consumption and stops the VCPU as appropriate. This provides a more elegant solution in that it allows the VCPU thread to release qemu_mutex before going to sleep. This current implementation uses a single signal. I think this is too racy in the long term so I think we should introduce a second signal. If two signals get coalesced into one, it could confuse the monitoring tool into giving the VCPU the inverse of its entitlement. It might be better to simply move this logic entirely into QEMU to make this more robust--the question is whether we think this is a good long term feature to carry in QEMU? Signed-off-by: Anthony Liguori <aliguori@xxxxxxxxxx> diff --git a/cpu-defs.h b/cpu-defs.h index 51533c6..6434dca 100644 --- a/cpu-defs.h +++ b/cpu-defs.h @@ -220,6 +220,7 @@ struct KVMCPUState { const char *cpu_model_str; \ struct KVMState *kvm_state; \ struct kvm_run *kvm_run; \ + int sigusr1_fd; \ int kvm_fd; \ int kvm_vcpu_dirty; \ struct KVMCPUState kvm_cpu_state; diff --git a/qemu-kvm.c b/qemu-kvm.c index 471306b..354109f 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -1351,6 +1351,29 @@ static void pause_all_threads(void) } } +static void vcpu_stop(CPUState *env) +{ + if (env != cpu_single_env) { + env->stop = 1; + pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); + } else { + env->stop = 0; + env->stopped = 1; + cpu_exit(env); + } + + while (!env->stopped) { + qemu_cond_wait(&qemu_pause_cond); + } +} + +static void vcpu_start(CPUState *env) +{ + env->stop = 0; + env->stopped = 0; + pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); +} + static void resume_all_threads(void) { CPUState *penv = first_cpu; @@ -1426,6 +1449,37 @@ static int kvm_main_loop_cpu(CPUState *env) return 0; 
} +static __thread int sigusr1_wfd; + +static void on_sigusr1(int signo) +{ + char ch = 0; + if (write(sigusr1_wfd, &ch, 1) < 0) { + /* who cares */ + } +} + +static void sigusr1_read(void *opaque) +{ + CPUState *env = opaque; + ssize_t len; + int caught_signal = 0; + + do { + char buffer[256]; + len = read(env->sigusr1_fd, buffer, sizeof(buffer)); + caught_signal = 1; + } while (len > 0); + + if (caught_signal) { + if (env->stopped) { + vcpu_start(env); + } else { + vcpu_stop(env); + } + } +} + static void *ap_main_loop(void *_env) { CPUState *env = _env; @@ -1433,10 +1487,12 @@ static void *ap_main_loop(void *_env) #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT struct ioperm_data *data = NULL; #endif + int fds[2]; current_env = env; env->thread_id = kvm_get_thread_id(); sigfillset(&signals); + sigdelset(&signals, SIGUSR1); sigprocmask(SIG_BLOCK, &signals, NULL); #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT @@ -1451,6 +1507,18 @@ static void *ap_main_loop(void *_env) kvm_create_vcpu(env, env->cpu_index); setup_kernel_sigmask(env); + if (pipe(fds) == -1) { + /* do nothing */ + } + + fcntl(fds[0], F_SETFL, O_NONBLOCK); + fcntl(fds[1], F_SETFL, O_NONBLOCK); + + env->sigusr1_fd = fds[0]; + sigusr1_wfd = fds[1]; + + qemu_set_fd_handler2(fds[0], NULL, sigusr1_read, NULL, env); + /* signal VCPU creation */ current_env->created = 1; pthread_cond_signal(&qemu_vcpu_cond); @@ -1463,6 +1531,8 @@ static void *ap_main_loop(void *_env) /* re-initialize cpu_single_env after re-acquiring qemu_mutex */ cpu_single_env = env; + signal(SIGUSR1, on_sigusr1); + kvm_main_loop_cpu(env); return NULL; } diff --git a/qemu-kvm.h b/qemu-kvm.h index 0f3fb50..3addc77 100644 --- a/qemu-kvm.h +++ b/qemu-kvm.h @@ -783,6 +783,7 @@ struct KVMState { int irqchip_in_kernel; int pit_in_kernel; int xsave, xcrs; + int sigusr2_fd; struct kvm_context kvm_context; }; -- 1.7.0.4 -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at 
http://vger.kernel.org/majordomo-info.html