On 11/22/2010 05:00 PM, Anthony Liguori wrote:
qemu-kvm vcpu threads don't respond to SIGSTOP/SIGCONT. Instead of teaching them to respond to these signals, introduce monitor commands that stop and start individual vcpus. The purpose of these commands is to implement CPU hard limits using an external tool that watches the CPU consumption and stops the CPU as appropriate. The monitor commands provide a more elegant solution than signals because they ensure that a stopped vcpu isn't holding the qemu_mutex. I'll reply to this note with an example tool.
This is super rough but demonstrates the concept. If you run it with '0 50 100' it will cap VCPU 0 at 50%.
It's not the prettiest thing in the world but it's minimally invasive and seems to work well.
Regards, Anthony Liguori
Signed-off-by: Anthony Liguori<aliguori@xxxxxxxxxx> diff --git a/hmp-commands.hx b/hmp-commands.hx index ba6de28..827bd67 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -279,6 +279,24 @@ Resume emulation. ETEXI { + .name = "cpu_start", + .args_type = "cpu:i", + .params = "[cpu]", + .help = "start cpu emulation", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_vcpu_start, + }, + + { + .name = "cpu_stop", + .args_type = "cpu:i", + .params = "[cpu]", + .help = "stop cpu emulation", + .user_print = monitor_user_noop, + .mhandler.cmd_new = do_vcpu_stop, + }, + + { .name = "gdbserver", .args_type = "device:s?", .params = "[device]", diff --git a/qemu-kvm.c b/qemu-kvm.c index 471306b..35121ed 100644 --- a/qemu-kvm.c +++ b/qemu-kvm.c @@ -1351,6 +1351,65 @@ static void pause_all_threads(void) } } +static void vcpu_stop(int cpu) +{ + CPUState *env = first_cpu; + + for (env = first_cpu; env; env = env->next_cpu) { + if (env->cpu_index == cpu) { + break; + } + } + + if (env) { + if (env != cpu_single_env) { + env->stop = 1; + pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); + } else { + env->stop = 0; + env->stopped = 1; + cpu_exit(env); + } + + while (!env->stopped) { + qemu_cond_wait(&qemu_pause_cond); + } + } +} + +static void vcpu_start(int cpu) +{ + CPUState *env = first_cpu; + + assert(!cpu_single_env); + + for (env = first_cpu; env; env = env->next_cpu) { + if (env->cpu_index == cpu) { + break; + } + } + + if (env) { + env->stop = 0; + env->stopped = 0; + pthread_kill(env->kvm_cpu_state.thread, SIG_IPI); + } +} + +int do_vcpu_stop(Monitor *mon, const QDict *qdict, QObject **ret_data) +{ + int vcpu = qdict_get_int(qdict, "cpu"); + vcpu_stop(vcpu); + return 0; +} + +int do_vcpu_start(Monitor *mon, const QDict *qdict, QObject **ret_data) +{ + int vcpu = qdict_get_int(qdict, "cpu"); + vcpu_start(vcpu); + return 0; +} + static void resume_all_threads(void) { CPUState *penv = first_cpu; diff --git a/sysemu.h b/sysemu.h index 849dc8c..3ef68dd 100644 --- 
a/sysemu.h +++ b/sysemu.h @@ -61,6 +61,9 @@ void qemu_system_reset(void); void qemu_add_exit_notifier(Notifier *notify); void qemu_remove_exit_notifier(Notifier *notify); +int do_vcpu_stop(Monitor *mon, const QDict *qdict, QObject **ret_data); +int do_vcpu_start(Monitor *mon, const QDict *qdict, QObject **ret_data); + void do_savevm(Monitor *mon, const QDict *qdict); int load_vmstate(const char *name); void do_delvm(Monitor *mon, const QDict *qdict);
/*
 * Example vcpu hard-limit tool.
 *
 * Connects to a QEMU human monitor on a UNIX socket, looks up the host thread
 * backing one vcpu, and polls that thread's /proc stat file.  Once the thread
 * has consumed its guest-time entitlement inside the current period the vcpu
 * is stopped with "cpu_stop"; it is restarted with "cpu_start" when the
 * period ends.
 *
 * Usage: PROG VCPU ENTITLEMENT_MS PERIOD_MS
 */
#ifndef _XOPEN_SOURCE
#define _XOPEN_SOURCE 500
#endif
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <stdarg.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/un.h>
#include <unistd.h>

/* Signed on purpose: it is compared against the signed tv_usec field. */
#define USEC_PER_SEC 1000000L

/* Path of the QEMU monitor socket this tool talks to. */
#define MONITOR_PATH "/tmp/monitor.sock"

/* Number of space separators to skip in the stat line before parsing. */
#define STAT_SPACES_TO_SKIP 42

/* Delay between two samples of the stat file, in microseconds (10ms). */
#define POLL_INTERVAL_USEC 10000

/* Text the human monitor prints when it is ready for the next command. */
static const char monitor_prompt[] = "\n(qemu) ";
#define MONITOR_PROMPT_LEN (sizeof(monitor_prompt) - 1)

/* Set from the signal handler to ask the main loop to finish. */
static volatile sig_atomic_t quit_requested;

static void handle_quit(int signo)
{
    (void)signo;
    quit_requested = 1;
}

/*
 * Return the number that follows the 42nd space of a stat line.
 *
 * NOTE(review): per proc(5) that is field 43 (guest_time) as long as the
 * comm field contains no spaces -- confirm this is the intended field.
 * Returns 0 when the line has fewer fields or the field is not numeric.
 */
static long get_cguest_time(const char *buffer)
{
    const char *ptr;
    int space_count = 0;

    for (ptr = buffer; *ptr && space_count != STAT_SPACES_TO_SKIP; ptr++) {
        if (*ptr == ' ') {
            space_count++;
        }
    }

    return strtol(ptr, NULL, 10);
}

/* Add usec microseconds to *tv, keeping tv_usec within [0, USEC_PER_SEC). */
static void tv_add(struct timeval *tv, suseconds_t usec)
{
    tv->tv_usec += usec;
    while (tv->tv_usec >= USEC_PER_SEC) {
        tv->tv_sec += 1;
        tv->tv_usec -= USEC_PER_SEC;
    }
}

/* Three-way compare of two timevals: -1, 0 or 1 for lhs <, ==, > rhs. */
static int tv_cmp(const struct timeval *lhs, const struct timeval *rhs)
{
    if (lhs->tv_sec != rhs->tv_sec) {
        return lhs->tv_sec < rhs->tv_sec ? -1 : 1;
    }
    if (lhs->tv_usec != rhs->tv_usec) {
        return lhs->tv_usec < rhs->tv_usec ? -1 : 1;
    }
    return 0;
}

/*
 * Write exactly buffer_len bytes to fd, retrying short writes and EINTR.
 * Returns 0 on success, -1 (errno set by write) on any other error.
 */
static int write_all(int fd, const void *buffer, size_t buffer_len)
{
    const char *data = buffer;
    size_t offset = 0;

    while (offset < buffer_len) {
        ssize_t len = write(fd, data + offset, buffer_len - offset);

        if (len > 0) {
            offset += (size_t)len;
        } else if (len < 0 && errno != EINTR) {
            return -1;
        }
    }

    return 0;
}

/*
 * Read from fd until the data ends with the monitor prompt.
 *
 * On success the prompt and everything up to and including the first newline
 * are removed, and buffer holds the remaining text as a NUL-terminated
 * string (empty if there was no newline).
 *
 * Returns 0 on success; -1 on read error, on end of file, or when the reply
 * does not fit in buffer_len bytes (buffer is still NUL-terminated then).
 */
static int read_reply(int fd, char *buffer, size_t buffer_len)
{
    size_t offset = 0;

    if (buffer_len == 0) {
        errno = EINVAL;
        return -1;
    }
    buffer[0] = 0;

    /* One byte is always kept free for the terminator. */
    while (offset < buffer_len - 1) {
        ssize_t len = read(fd, buffer + offset, buffer_len - 1 - offset);
        char *ptr;

        if (len < 0) {
            if (errno == EINTR) {
                continue;
            }
            return -1;
        }
        if (len == 0) {
            /* Peer closed the connection before sending a prompt. */
            errno = ECONNRESET;
            return -1;
        }

        offset += (size_t)len;
        buffer[offset] = 0;

        if (offset < MONITOR_PROMPT_LEN ||
            memcmp(monitor_prompt, buffer + (offset - MONITOR_PROMPT_LEN),
                   MONITOR_PROMPT_LEN) != 0) {
            continue;
        }

        buffer[offset - MONITOR_PROMPT_LEN] = 0;
        ptr = strchr(buffer, '\n');
        if (ptr == NULL) {
            buffer[0] = 0;
        } else {
            memmove(buffer, ptr + 1, strlen(ptr + 1) + 1);
        }
        return 0;
    }

    errno = EMSGSIZE;
    return -1;
}

static int monitor_fd = -1;

/*
 * Format a command, send it followed by a newline on monitor_fd, and read
 * the reply into rsp.  Returns 0 on success, -1 on formatting, write or
 * read failure.
 */
static int monitor_vcommand(char *rsp, size_t rsp_len,
                            const char *fmt, va_list ap)
{
    char cmd[256];
    int len;

    len = vsnprintf(cmd, sizeof(cmd), fmt, ap);
    /* Reject errors and truncation; keep one byte for the newline. */
    if (len < 0 || (size_t)len >= sizeof(cmd) - 1) {
        errno = EINVAL;
        return -1;
    }
    cmd[len] = '\n';

    if (write_all(monitor_fd, cmd, (size_t)len + 1) < 0) {
        return -1;
    }

    return read_reply(monitor_fd, rsp, rsp_len);
}

/* Run a monitor command and discard its reply.  Returns 0 or -1. */
static int monitor_command(const char *fmt, ...)
{
    char reply[256];
    va_list ap;
    int rc;

    va_start(ap, fmt);
    rc = monitor_vcommand(reply, sizeof(reply), fmt, ap);
    va_end(ap);

    return rc;
}

/* Run a monitor command and store its reply in rsp.  Returns 0 or -1. */
static int monitor_command_response(char *rsp, size_t rsp_len,
                                    const char *fmt, ...)
{
    va_list ap;
    int rc;

    va_start(ap, fmt);
    rc = monitor_vcommand(rsp, rsp_len, fmt, ap);
    va_end(ap);

    return rc;
}

/* Whether this tool believes the vcpu is currently running. */
static bool vm_running = true;

/* Issue "cpu_start" if the vcpu was stopped by us.  Returns 0 or -1. */
static int guest_start(int vcpu)
{
    if (!vm_running) {
        if (monitor_command("cpu_start %d", vcpu) < 0) {
            return -1;
        }
        vm_running = true;
    }
    return 0;
}

/* Issue "cpu_stop" if the vcpu is running.  Returns 0 or -1. */
static int guest_stop(int vcpu)
{
    if (vm_running) {
        if (monitor_command("cpu_stop %d", vcpu) < 0) {
            return -1;
        }
        vm_running = false;
    }
    return 0;
}

/*
 * Extract the "thread_id=" value from line number vcpu (0-based) of an
 * "info cpus" reply.  Returns the thread id, or 0 if the line or the key is
 * missing or the value is not a positive int.
 */
static int find_pid(char *buffer, int vcpu)
{
    char *ptr = buffer;
    long value;
    int i;

    for (i = 0; ptr && i < vcpu; i++) {
        ptr = strchr(ptr, '\n');
        if (ptr) {
            ptr++;
        }
    }
    if (ptr == NULL) {
        return 0;
    }

    ptr = strstr(ptr, "thread_id=");
    if (ptr == NULL) {
        return 0;
    }

    errno = 0;
    value = strtol(ptr + strlen("thread_id="), NULL, 10);
    if (errno == ERANGE || value <= 0 || value > INT_MAX) {
        return 0;
    }

    return (int)value;
}

/*
 * Parse a whole-string decimal integer in [min, max].
 * Returns 0 and stores the value in *out, or -1 on malformed/out-of-range
 * input.
 */
static int parse_long(const char *text, long min, long max, long *out)
{
    char *end;
    long value;

    errno = 0;
    value = strtol(text, &end, 10);
    if (end == text || *end != '\0' || errno == ERANGE ||
        value < min || value > max) {
        return -1;
    }

    *out = value;
    return 0;
}

int main(int argc, char **argv)
{
    int fd = -1;
    int pid;
    int rc = 1;
    char buffer[1024];
    long ticks_per_sec;
    long cguest_time_last = 0;
    bool have_last = false;
    struct timeval period_end;
    long cguest_ticks = 0;
    long vcpu_arg;
    int vcpu;
    long entitlement;
    long period;
    struct sockaddr_un addr;
    struct sigaction sa;

    if (argc != 4) {
        fprintf(stderr, "Missing arguments\n");
        fprintf(stderr, "usage: %s VCPU ENTITLEMENT_MS PERIOD_MS\n",
                argc > 0 ? argv[0] : "cpulimit");
        return 1;
    }

    ticks_per_sec = sysconf(_SC_CLK_TCK);
    if (ticks_per_sec <= 0) {
        perror("sysconf(_SC_CLK_TCK)");
        return 1;
    }

    /* Upper bounds keep the unit conversions below free of overflow. */
    if (parse_long(argv[1], 0, INT_MAX, &vcpu_arg) < 0 ||
        parse_long(argv[2], 0, LONG_MAX / (2 * ticks_per_sec),
                   &entitlement) < 0 ||
        parse_long(argv[3], 1, LONG_MAX / 1000, &period) < 0) {
        fprintf(stderr, "Invalid arguments\n");
        return 1;
    }
    vcpu = (int)vcpu_arg;

    /* FIXME hack, does guest time get scaled with vcpu count? */
    entitlement *= 2;

    /* Milliseconds -> clock ticks, and milliseconds -> microseconds. */
    entitlement = (entitlement * ticks_per_sec) / 1000;
    period *= 1000;

    /*
     * Leave the loop on SIGINT/SIGTERM so a stopped vcpu can be restarted
     * before exiting; ignore SIGPIPE so a closed monitor shows up as a
     * write error instead of killing the process.
     */
    memset(&sa, 0, sizeof(sa));
    sa.sa_handler = handle_quit;
    sigemptyset(&sa.sa_mask);
    sigaction(SIGINT, &sa, NULL);
    sigaction(SIGTERM, &sa, NULL);
    signal(SIGPIPE, SIG_IGN);

    monitor_fd = socket(PF_UNIX, SOCK_STREAM, 0);
    if (monitor_fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&addr, 0, sizeof(addr));
    addr.sun_family = AF_UNIX;
    snprintf(addr.sun_path, sizeof(addr.sun_path), "%s", MONITOR_PATH);

    if (connect(monitor_fd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
        perror("connect " MONITOR_PATH);
        goto out;
    }

    /* Consume the greeting banner up to the first prompt. */
    if (read_reply(monitor_fd, buffer, sizeof(buffer)) < 0) {
        perror("monitor greeting");
        goto out;
    }

    if (monitor_command_response(buffer, sizeof(buffer), "info cpus") < 0) {
        perror("info cpus");
        goto out;
    }

    pid = find_pid(buffer, vcpu);
    if (pid <= 0) {
        fprintf(stderr, "Could not find thread id of vcpu %d\n", vcpu);
        goto out;
    }

    snprintf(buffer, sizeof(buffer), "/proc/%d/stat", pid);
    fd = open(buffer, O_RDONLY);
    if (fd < 0) {
        perror(buffer);
        goto out;
    }

    gettimeofday(&period_end, NULL);
    tv_add(&period_end, period);

    while (!quit_requested) {
        long cguest_time_now;
        struct timeval tv_now;
        ssize_t len;

        gettimeofday(&tv_now, NULL);

        len = pread(fd, buffer, sizeof(buffer) - 1, 0);
        if (len < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("pread");
            goto out;
        }
        buffer[len] = 0;

        cguest_time_now = get_cguest_time(buffer);

        /* The first sample only establishes the baseline. */
        if (have_last) {
            cguest_ticks += cguest_time_now - cguest_time_last;

            if (tv_cmp(&tv_now, &period_end) < 0) {
                if (cguest_ticks >= entitlement) {
                    if (guest_stop(vcpu) < 0) {
                        perror("cpu_stop");
                        goto out;
                    }
                    cguest_ticks = 0;
                }
            } else {
                /* Period over: resume the vcpu and open a new period. */
                if (guest_start(vcpu) < 0) {
                    perror("cpu_start");
                    goto out;
                }
                cguest_ticks = 0;
                tv_add(&tv_now, period);
                period_end = tv_now;
            }
        }

        cguest_time_last = cguest_time_now;
        have_last = true;

        usleep(POLL_INTERVAL_USEC);
    }

    rc = 0;

out:
    /* Never leave the vcpu stopped behind us if it can be helped. */
    if (!vm_running && guest_start(vcpu) < 0) {
        fprintf(stderr, "warning: vcpu %d may have been left stopped\n", vcpu);
        rc = 1;
    }
    if (fd >= 0) {
        close(fd);
    }
    if (monitor_fd >= 0) {
        close(monitor_fd);
    }

    return rc;
}