experimental code part 3 (qemu userspace) ----------------------------------------- This code utilizes the new ioctl commands introduced by code part 2. The KVM_CREATE_PMTMR ioctl command is simply called once when a virtual machine is being created. However, calling KVM_CONFIGURE_PMTMR is more challenging because it involves ... - passing the base address of PM I/O port range to code part 1 - passing the clock offset to code part 1 'timers_state.cpu_clock_offset' gets updated at each vm_start() call. However, the PM I/O port base address is not available at the first vm_start() call. So, configuring the in-kernel PM Timer needs to be postponed until the PIIX4 PCI configuration is initialized. This is facilitated by the new function kvm_pmtmr_handler() which is called by vm_start() and by pm_io_space_update(). kvm_pmtmr_handler() calls architecture-specific code through a function pointer 'kvm_arch_pmtmr_handler'. kvm_pmtmr_handler() is a 'no-op' if an architecture does not provide or clears this function pointer. The architecture-specific code is responsible for configuring the in-kernel PM Timer. The experimental code provides kvm_arch_configure_pmtmr_wrapper() in qemu-kvm-x86.c. kvm_arch_create_pmtmr() sets 'kvm_arch_pmtmr_handler' to 'kvm_arch_configure_pmtmr_wrapper' after successful completion of the KVM_CREATE_PMTMR ioctl command. kvm_arch_configure_pmtmr_wrapper() requires ACPI PM code to provide a function pointer 'kvm_arch_get_pm_io_base' through which the PM I/O port base address can be obtained. kvm_arch_configure_pmtmr_wrapper() is a 'no-op' too if ACPI PM code does not provide or clears this function pointer. The experimental code provides piix4_get_pm_io_base() in hw/acpi_piix4.c. pm_io_space_update() sets 'kvm_arch_get_pm_io_base' to 'piix4_get_pm_io_base'. Consider two scenarios ... 
- during virtual machine creation and startup kvm_arch_create kvm_arch_create_pmtmr ioctl(KVM_CREATE_PMTMR) kvm_arch_pmtmr_handler = kvm_arch_configure_pmtmr_wrapper : vm_start kvm_pmtmr_handler kvm_arch_configure_pmtmr_wrapper 'no-op' because kvm_arch_get_pm_io_base not set yet : pm_io_space_update kvm_arch_get_pm_io_base = piix4_get_pm_io_base kvm_pmtmr_handler kvm_arch_configure_pmtmr_wrapper obtain PM I/O port base thru kvm_arch_get_pm_io_base kvm_arch_configure_pmtmr ioctl(KVM_CONFIGURE_PMTMR) - any other vm_start() call, for example after migration vm_start kvm_pmtmr_handler kvm_arch_configure_pmtmr_wrapper obtain PM I/O port base thru kvm_arch_get_pm_io_base kvm_arch_configure_pmtmr ioctl(KVM_CONFIGURE_PMTMR) diff -up ./hw/acpi_piix4.c.orig3 ./hw/acpi_piix4.c --- ./hw/acpi_piix4.c.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./hw/acpi_piix4.c 2010-12-10 11:26:53.943753235 +0100 @@ -23,6 +23,7 @@ #include "acpi.h" #include "sysemu.h" #include "range.h" +#include "qemu-kvm.h" //#define DEBUG @@ -80,6 +81,9 @@ typedef struct PIIX4PMState { static void piix4_acpi_system_hot_add_init(PCIBus *bus, PIIX4PMState *s); +/* for cpu hotadd (and in-kernel PM Timer if KVM_CAP_PMTMR is defined) */ +static PIIX4PMState *global_piix4_pm_state; + #define ACPI_ENABLE 0xf1 #define ACPI_DISABLE 0xf0 @@ -250,6 +254,19 @@ static void acpi_dbg_writel(void *opaque PIIX4_DPRINTF("ACPI: DBG: 0x%08x\n", val); } +#ifdef KVM_CAP_PMTMR +static uint64_t piix4_get_pm_io_base(void) +{ + PIIX4PMState *s = global_piix4_pm_state; + uint32_t pm_io_base; + + pm_io_base = le32_to_cpu(*(uint32_t *)(s->dev.config + 0x40)); + pm_io_base &= 0xffc0; + + return (uint64_t)pm_io_base; +} +#endif + static void pm_io_space_update(PIIX4PMState *s) { uint32_t pm_io_base; @@ -262,6 +279,16 @@ static void pm_io_space_update(PIIX4PMSt PIIX4_DPRINTF("PM: mapping to 0x%x\n", pm_io_base); iorange_init(&s->ioport, &pm_iorange_ops, pm_io_base, 64); ioport_register(&s->ioport); +#ifdef KVM_CAP_PMTMR + 
kvm_arch_get_pm_io_base = piix4_get_pm_io_base; + /* + * The base address of the PM I/O port address range is now known. + * The following call is needed to pass the base address to the + * in-kernel PM Timer emulation. Note that 'kvm_arch_get_pm_io_base' + * must be set _before_ this call. + */ + kvm_pmtmr_handler(); +#endif } } @@ -354,14 +381,12 @@ static void piix4_powerdown(void *opaque } } -static PIIX4PMState *global_piix4_pm_state; /* cpu hotadd */ - static int piix4_pm_initfn(PCIDevice *dev) { PIIX4PMState *s = DO_UPCAST(PIIX4PMState, dev, dev); uint8_t *pci_conf; - /* for cpu hotadd */ + /* for cpu hotadd and in-kernel PM Timer */ global_piix4_pm_state = s; pci_conf = s->dev.config; diff -up ./kvm/include/linux/kvm.h.orig3 ./kvm/include/linux/kvm.h --- ./kvm/include/linux/kvm.h.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./kvm/include/linux/kvm.h 2010-12-10 10:00:11.646936579 +0100 @@ -140,6 +140,12 @@ struct kvm_pit_config { __u32 pad[15]; }; +/* for KVM_CONFIGURE_PMTMR */ +struct kvm_pmtmr_config { + __u64 pm_io_base; + __s64 clock_offset; +}; + #define KVM_PIT_SPEAKER_DUMMY 1 #define KVM_EXIT_UNKNOWN 0 @@ -530,6 +536,9 @@ struct kvm_enable_cap { #ifdef __KVM_HAVE_XCRS #define KVM_CAP_XCRS 56 #endif +#ifdef __KVM_HAVE_PMTMR +#define KVM_CAP_PMTMR 60 +#endif #ifdef KVM_CAP_IRQ_ROUTING @@ -660,6 +669,8 @@ struct kvm_clock_data { #define KVM_XEN_HVM_CONFIG _IOW(KVMIO, 0x7a, struct kvm_xen_hvm_config) #define KVM_SET_CLOCK _IOW(KVMIO, 0x7b, struct kvm_clock_data) #define KVM_GET_CLOCK _IOR(KVMIO, 0x7c, struct kvm_clock_data) +#define KVM_CREATE_PMTMR _IO(KVMIO, 0x7d) +#define KVM_CONFIGURE_PMTMR _IOW(KVMIO, 0x7e, struct kvm_pmtmr_config) /* Available with KVM_CAP_PIT_STATE2 */ #define KVM_GET_PIT2 _IOR(KVMIO, 0x9f, struct kvm_pit_state2) #define KVM_SET_PIT2 _IOW(KVMIO, 0xa0, struct kvm_pit_state2) diff -up ./kvm/include/x86/asm/kvm.h.orig3 ./kvm/include/x86/asm/kvm.h --- ./kvm/include/x86/asm/kvm.h.orig3 2010-12-02 15:15:20.000000000 +0100 +++ 
./kvm/include/x86/asm/kvm.h 2010-12-10 11:29:56.410873314 +0100 @@ -24,6 +24,7 @@ #define __KVM_HAVE_DEBUGREGS #define __KVM_HAVE_XSAVE #define __KVM_HAVE_XCRS +#define __KVM_HAVE_PMTMR /* Architectural interrupt line count. */ #define KVM_NR_INTERRUPTS 256 diff -up ./qemu-kvm.c.orig3 ./qemu-kvm.c --- ./qemu-kvm.c.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./qemu-kvm.c 2010-12-10 10:50:42.857811776 +0100 @@ -185,6 +185,9 @@ int kvm_init(int smp_cpus) kvm_context->dirty_pages_log_all = 0; kvm_context->no_irqchip_creation = 0; kvm_context->no_pit_creation = 0; +#ifdef KVM_CAP_PMTMR + kvm_context->no_pmtmr_creation = 0; +#endif #ifdef KVM_CAP_SET_GUEST_DEBUG QTAILQ_INIT(&kvm_state->kvm_sw_breakpoints); @@ -237,6 +240,22 @@ void kvm_disable_pit_creation(kvm_contex kvm->no_pit_creation = 1; } +#ifdef KVM_CAP_PMTMR +void (*kvm_arch_pmtmr_handler)(kvm_context_t kvm); +/* + * This handler is called by + * - the monitor thread during vm_start(). + * - the ACPI PM code during pm_io_space_update(). + * It is a 'no-op' if an architecture-specific handler is not available. + * Architecture-specific code must configure the in-kernel PM Timer emulation. 
+ */ +void kvm_pmtmr_handler(void) +{ + if (kvm_arch_pmtmr_handler) + kvm_arch_pmtmr_handler(kvm_context); +} +#endif + static void kvm_reset_vcpu(void *opaque) { CPUState *env = opaque; diff -up ./qemu-kvm.h.orig3 ./qemu-kvm.h --- ./qemu-kvm.h.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./qemu-kvm.h 2010-12-10 11:26:43.726790319 +0100 @@ -64,6 +64,10 @@ struct kvm_context { int irqchip_inject_ioctl; /// do not create in-kernel pit if set int no_pit_creation; +#ifdef KVM_CAP_PMTMR + /// do not create in-kernel PM Timer if set + int no_pmtmr_creation; +#endif #ifdef KVM_CAP_IRQ_ROUTING struct kvm_irq_routing *irq_routes; int nr_allocated_irq_routes; @@ -655,8 +659,14 @@ int kvm_qemu_create_memory_alias(uint64_ uint64_t target_phys); int kvm_qemu_destroy_memory_alias(uint64_t phys_start); -int kvm_arch_qemu_create_context(void); +#ifdef KVM_CAP_PMTMR +void kvm_pmtmr_handler(void); +int kvm_arch_configure_pmtmr(kvm_context_t kvm, struct kvm_pmtmr_config *conf); +extern void (*kvm_arch_pmtmr_handler)(kvm_context_t kvm); +extern uint64_t (*kvm_arch_get_pm_io_base)(void); +#endif +int kvm_arch_qemu_create_context(void); void kvm_arch_save_regs(CPUState *env); void kvm_arch_load_regs(CPUState *env, int level); int kvm_arch_has_work(CPUState *env); diff -up ./qemu-kvm-x86.c.orig3 ./qemu-kvm-x86.c --- ./qemu-kvm-x86.c.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./qemu-kvm-x86.c 2010-12-10 11:26:39.665811451 +0100 @@ -15,6 +15,9 @@ #include <sys/io.h> #include "qemu-kvm.h" +#ifdef KVM_CAP_PMTMR +#include "qemu-timer.h" +#endif #include "libkvm.h" #include <pthread.h> #include <sys/utsname.h> @@ -124,6 +127,61 @@ static int kvm_create_pit(kvm_context_t return 0; } +#ifdef KVM_CAP_PMTMR + +int kvm_arch_configure_pmtmr(kvm_context_t kvm, struct kvm_pmtmr_config *conf) +{ + int r; + + if (kvm_arch_pmtmr_handler) { + r = kvm_vm_ioctl(kvm_state, KVM_CONFIGURE_PMTMR, conf); + if (r < 0) { + fprintf(stderr, "Configure kernel PM Timer failed\n"); + kvm_arch_pmtmr_handler = 0; + } 
+ } + return 0; +} + +uint64_t (*kvm_arch_get_pm_io_base)(void); +/* + * Architecture-specific code called by kvm_pmtmr_handler(). + * Configures the in-kernel PM Timer emulation if the ACPI PM code provides + * a function to obtain the base address of the PM I/O port address range. + */ +static void kvm_arch_configure_pmtmr_wrapper(kvm_context_t kvm) +{ + struct kvm_pmtmr_config conf; + + if (kvm_arch_get_pm_io_base) { + conf.pm_io_base = kvm_arch_get_pm_io_base(); + conf.clock_offset = cpu_get_clock_offset(); + kvm_arch_configure_pmtmr(kvm, &conf); + } +} + +static int kvm_arch_create_pmtmr(kvm_context_t kvm) +{ + int r; + + if (!kvm->no_pmtmr_creation) { + r = kvm_ioctl(kvm_state, KVM_CHECK_EXTENSION, KVM_CAP_PMTMR); + if (r <= 0) + return 0; + + r = kvm_vm_ioctl(kvm_state, KVM_CREATE_PMTMR); + if (r < 0) { + fprintf(stderr, "Create kernel PM Timer failed\n"); + return r; + } + /* for kvm_pmtmr_handler() */ + kvm_arch_pmtmr_handler = kvm_arch_configure_pmtmr_wrapper; + } + return 0; +} + +#endif + int kvm_arch_create(kvm_context_t kvm, unsigned long phys_mem_bytes, void **vm_mem) { @@ -156,7 +214,9 @@ int kvm_arch_create(kvm_context_t kvm, u if (r < 0) { return r; } - +#ifdef KVM_CAP_PMTMR + kvm_arch_create_pmtmr(kvm); +#endif return 0; } diff -up ./qemu-timer.c.orig3 ./qemu-timer.c --- ./qemu-timer.c.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./qemu-timer.c 2010-12-10 10:45:27.071749627 +0100 @@ -110,6 +110,11 @@ static int64_t cpu_get_clock(void) } } +int64_t cpu_get_clock_offset(void) +{ + return timers_state.cpu_clock_offset; +} + /* FIXME: qemu-kvm hack */ #define CONFIG_IOTHREAD 1 #ifndef CONFIG_IOTHREAD diff -up ./qemu-timer.h.orig3 ./qemu-timer.h --- ./qemu-timer.h.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./qemu-timer.h 2010-12-10 10:45:33.367685692 +0100 @@ -53,6 +53,7 @@ int qemu_calculate_timeout(void); void init_clocks(void); int init_timer_alarm(void); void quit_timers(void); +int64_t cpu_get_clock_offset(void); static inline int64_t 
get_ticks_per_sec(void) { diff -up ./vl.c.orig3 ./vl.c --- ./vl.c.orig3 2010-12-02 15:15:20.000000000 +0100 +++ ./vl.c 2010-12-10 10:34:55.388997058 +0100 @@ -1091,6 +1091,14 @@ void vm_start(void) { if (!vm_running) { cpu_enable_ticks(); +#ifdef KVM_CAP_PMTMR + /* + * cpu_enable_ticks() has updated 'timers_state.cpu_clock_offset'. + * The following call is needed to pass the updated clock offset + * to the in-kernel PM Timer emulation. + */ + kvm_pmtmr_handler(); +#endif vm_running = 1; vm_state_notify(1, 0); resume_all_vcpus(); -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html