On Fri, Feb 10, 2012 at 18:31, Jan Kiszka <jan.kiszka@xxxxxxxxxxx> wrote: > This enables acceleration for MMIO-based TPR registers accesses of > 32-bit Windows guest systems. It is mostly useful with KVM enabled, > either on older Intel CPUs (without flexpriority feature, can also be > manually disabled for testing) or any current AMD processor. > > The approach introduced here is derived from the original version of > qemu-kvm. It was refactored, documented, and extended by support for > user space APIC emulation, both with and without KVM acceleration. The > VMState format was kept compatible, so was the ABI to the option ROM > that implements the guest-side para-virtualized driver service. This > enables seamless migration from qemu-kvm to upstream or, one day, > between KVM and TCG mode. > > The basic concept goes like this: > - VAPIC PV interface consisting of I/O port 0x7e and (for KVM in-kernel > irqchip) a vmcall hypercall is registered > - VAPIC option ROM is loaded into guest > - option ROM activates TPR MMIO access reporting via port 0x7e > - TPR accesses are trapped and patched in the guest to call into option > ROM instead, VAPIC support is enabled > - option ROM TPR helpers track state in memory and invoke hypercall to > poll for pending IRQs if required > > Signed-off-by: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> I must say that I find the approach horrible, patching guests and ROMs and looking up Windows internals. Taking the same approach to extreme, we could for example patch Xen guest to become a KVM guest. Not that I object merging. > --- > Makefile.target | 3 +- > hw/apic.c | 126 ++++++++- > hw/apic_common.c | 64 +++++- > hw/apic_internal.h | 27 ++ > hw/kvm/apic.c | 32 +++ > hw/kvmvapic.c | 774 ++++++++++++++++++++++++++++++++++++++++++++++++++++ > 6 files changed, 1012 insertions(+), 14 deletions(-) > create mode 100644 hw/kvmvapic.c > > diff --git a/Makefile.target b/Makefile.target > index 68481a3..ec7eff8 100644 > --- a/Makefile.target > +++ b/Makefile.target > @@ -230,7 +230,8 @@ obj-y += device-hotplug.o > > # Hardware support > obj-i386-y += mc146818rtc.o pc.o > -obj-i386-y += sga.o apic_common.o apic.o ioapic_common.o ioapic.o piix_pci.o > +obj-i386-y += apic_common.o apic.o kvmvapic.o > +obj-i386-y += sga.o ioapic_common.o ioapic.o piix_pci.o > obj-i386-y += vmport.o > obj-i386-y += pci-hotplug.o smbios.o wdt_ib700.o > obj-i386-y += debugcon.o multiboot.o > diff --git a/hw/apic.c b/hw/apic.c > index 086c544..2ebf3ca 100644 > --- a/hw/apic.c > +++ b/hw/apic.c > @@ -35,6 +35,10 @@ > #define MSI_ADDR_DEST_ID_SHIFT 12 > #define MSI_ADDR_DEST_ID_MASK 0x00ffff0 > > +#define SYNC_FROM_VAPIC 0x1 > +#define SYNC_TO_VAPIC 0x2 > +#define SYNC_ISR_IRR_TO_VAPIC 0x4 Enum, please. > + > static APICCommonState *local_apics[MAX_APICS + 1]; > > static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode); > @@ -78,6 +82,70 @@ static inline int get_bit(uint32_t *tab, int index) > return !!(tab[i] & mask); > } > > +/* return -1 if no bit is set */ > +static int get_highest_priority_int(uint32_t *tab) > +{ > + int i; > + for (i = 7; i >= 0; i--) { > + if (tab[i] != 0) { > + return i * 32 + fls_bit(tab[i]); > + } > + } > + return -1; > +} > + > +static void apic_sync_vapic(APICCommonState *s, int sync_type) > +{ > + VAPICState vapic_state; > + size_t length; > + off_t start; > + int vector; > + > + if (!s->vapic_paddr) { > + return; > + } > + if (sync_type & SYNC_FROM_VAPIC) { > + cpu_physical_memory_rw(s->vapic_paddr, (void *)&vapic_state, > + sizeof(vapic_state), 0); > + s->tpr = vapic_state.tpr; > + } > + if (sync_type & (SYNC_TO_VAPIC | SYNC_ISR_IRR_TO_VAPIC)) { > + start = offsetof(VAPICState, isr); > + length = offsetof(VAPICState, enabled) - offsetof(VAPICState, isr); > + > + if (sync_type & SYNC_TO_VAPIC) { > + assert(qemu_cpu_is_self(s->cpu_env)); > + > + vapic_state.tpr = s->tpr; > + vapic_state.enabled = 1; > + start = 0; > + length = sizeof(VAPICState); > + } > + > + vector = get_highest_priority_int(s->isr); > + if (vector < 0) { > + vector = 0; > + } > + vapic_state.isr = vector & 0xf0; > + > + vapic_state.zero = 0; > + > + vector = get_highest_priority_int(s->irr); > + if (vector < 0) { > + vector = 0; > + } > + vapic_state.irr = vector & 0xff; > + > + cpu_physical_memory_write_rom(s->vapic_paddr + start, > + ((void *)&vapic_state) + start, length); This assumes that the vapic_state structure matches guest what guest expect without conversion. Is this true for i386 on x86_64? I didn't check the structure in question. > + } > +} > + > +static void apic_vapic_base_update(APICCommonState *s) > +{ > + apic_sync_vapic(s, SYNC_TO_VAPIC); > +} > + > static void apic_local_deliver(APICCommonState *s, int vector) > { > uint32_t lvt = s->lvt[vector]; > @@ -239,20 +307,17 @@ static void apic_set_base(APICCommonState *s, uint64_t val) > > static void apic_set_tpr(APICCommonState *s, uint8_t val) > { > - s->tpr = (val & 0x0f) << 4; > - apic_update_irq(s); > + /* Updates from cr8 are ignored while the VAPIC is active */ > + if (!s->vapic_paddr) { > + s->tpr = val << 4; > + apic_update_irq(s); > + } > } > > -/* return -1 if no bit is set */ > -static int get_highest_priority_int(uint32_t *tab) > +static uint8_t apic_get_tpr(APICCommonState *s) > { > - int i; > - for(i = 7; i >= 0; i--) { > - if (tab[i] != 0) { > - return i * 32 + fls_bit(tab[i]); > - } > - } > - return -1; > + apic_sync_vapic(s, SYNC_FROM_VAPIC); > + return s->tpr >> 4; > } > > static int apic_get_ppr(APICCommonState *s) > @@ -312,6 +377,14 @@ static void apic_update_irq(APICCommonState *s) > } > } > > +void apic_poll_irq(DeviceState *d) > +{ > + APICCommonState *s = APIC_COMMON(d); > + > + apic_sync_vapic(s, SYNC_FROM_VAPIC); > + apic_update_irq(s); > +} > + > static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) > { > apic_report_irq_delivered(!get_bit(s->irr, vector_num)); > @@ -321,6 +394,16 @@ static void apic_set_irq(APICCommonState *s, int vector_num, int trigger_mode) > set_bit(s->tmr, vector_num); > else > reset_bit(s->tmr, vector_num); > + if (s->vapic_paddr) { > + apic_sync_vapic(s, SYNC_ISR_IRR_TO_VAPIC); > + /* > + * The vcpu thread needs to see the new IRR before we pull its current > + * TPR value. That way, if we miss a lowering of the TRP, the guest > + * has the chance to notice the new IRR and poll for IRQs on its own. > + */ > + smp_wmb(); > + apic_sync_vapic(s, SYNC_FROM_VAPIC); > + } > apic_update_irq(s); > } > > @@ -334,6 +417,7 @@ static void apic_eoi(APICCommonState *s) > if (!(s->spurious_vec & APIC_SV_DIRECTED_IO) && get_bit(s->tmr, isrv)) { > ioapic_eoi_broadcast(isrv); > } > + apic_sync_vapic(s, SYNC_FROM_VAPIC | SYNC_TO_VAPIC); > apic_update_irq(s); > } > > @@ -471,15 +555,19 @@ int apic_get_interrupt(DeviceState *d) > if (!(s->spurious_vec & APIC_SV_ENABLE)) > return -1; > > + apic_sync_vapic(s, SYNC_FROM_VAPIC); > intno = apic_irq_pending(s); > > if (intno == 0) { > + apic_sync_vapic(s, SYNC_TO_VAPIC); > return -1; > } else if (intno < 0) { > + apic_sync_vapic(s, SYNC_TO_VAPIC); > return s->spurious_vec & 0xff; > } > reset_bit(s->irr, intno); > set_bit(s->isr, intno); > + apic_sync_vapic(s, SYNC_TO_VAPIC); > apic_update_irq(s); > return intno; > } > @@ -576,6 +664,10 @@ static uint32_t apic_mem_readl(void *opaque, target_phys_addr_t addr) > val = 0x11 | ((APIC_LVT_NB - 1) << 16); /* version 0x11 */ > break; > case 0x08: > + apic_sync_vapic(s, SYNC_FROM_VAPIC); > + if (apic_report_tpr_access) { > + cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_READ); > + } > val = s->tpr; > break; > case 0x09: > @@ -675,7 +767,11 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) > case 0x03: > break; > case 0x08: > + if (apic_report_tpr_access) { > + cpu_report_tpr_access(s->cpu_env, TPR_ACCESS_WRITE); > + } > s->tpr = val; > + apic_sync_vapic(s, SYNC_TO_VAPIC); > apic_update_irq(s); > break; > case 0x09: > @@ -737,6 +833,11 @@ static void apic_mem_writel(void *opaque, target_phys_addr_t addr, uint32_t val) > } > } > > +static void apic_pre_save(APICCommonState *s) > +{ > + apic_sync_vapic(s, SYNC_FROM_VAPIC); > +} > + > static void apic_post_load(APICCommonState *s) > { > if (s->timer_expiry != -1) { > @@ -770,7 +871,10 @@ static void apic_class_init(ObjectClass *klass, void *data) > k->init = apic_init; > k->set_base = apic_set_base; > k->set_tpr = apic_set_tpr; > + k->get_tpr = apic_get_tpr; > + k->vapic_base_update = apic_vapic_base_update; > k->external_nmi = apic_external_nmi; > + k->pre_save = apic_pre_save; > k->post_load = apic_post_load; > } > > diff --git a/hw/apic_common.c b/hw/apic_common.c > index 588531b..1977da7 100644 > --- a/hw/apic_common.c > +++ b/hw/apic_common.c > @@ -20,8 +20,10 @@ > #include "apic.h" > #include "apic_internal.h" > #include "trace.h" > +#include "kvm.h" > > static int apic_irq_delivered; > +bool apic_report_tpr_access; This should go to APICCommonState. > > void cpu_set_apic_base(DeviceState *d, uint64_t val) > { > @@ -63,13 +65,44 @@ void cpu_set_apic_tpr(DeviceState *d, uint8_t val) > > uint8_t cpu_get_apic_tpr(DeviceState *d) > { > + APICCommonState *s; > + APICCommonClass *info; > + > + if (!d) { > + return 0; > + } > + > + s = APIC_COMMON(d); > + info = APIC_COMMON_GET_CLASS(s); > + > + return info->get_tpr(s); > +} > + > +void apic_enable_tpr_access_reporting(DeviceState *d) > +{ > + APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); > + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); > + > + apic_report_tpr_access = true; > + if (info->enable_tpr_reporting) { > + info->enable_tpr_reporting(s); > + } > +} > + > +void apic_enable_vapic(DeviceState *d, target_phys_addr_t paddr) > +{ > APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); > + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); > > - return s ? s->tpr >> 4 : 0; > + s->vapic_paddr = paddr; > + info->vapic_base_update(s); > } > > void apic_handle_tpr_access_report(DeviceState *d, target_ulong ip, int access) > { > + APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); > + > + vapic_report_tpr_access(s->vapic, s->cpu_env, ip, access); > } > > void apic_report_irq_delivered(int delivered) > @@ -170,12 +203,16 @@ void apic_init_reset(DeviceState *d) > static void apic_reset_common(DeviceState *d) > { > APICCommonState *s = DO_UPCAST(APICCommonState, busdev.qdev, d); > + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); > bool bsp; > > bsp = cpu_is_bsp(s->cpu_env); > s->apicbase = 0xfee00000 | > (bsp ? MSR_IA32_APICBASE_BSP : 0) | MSR_IA32_APICBASE_ENABLE; > > + s->vapic_paddr = 0; > + info->vapic_base_update(s); > + > apic_init_reset(d); > > if (bsp) { > @@ -238,6 +275,7 @@ static int apic_init_common(SysBusDevice *dev) > { > APICCommonState *s = APIC_COMMON(dev); > APICCommonClass *info; > + static DeviceState *vapic; > static int apic_no; > > if (apic_no >= MAX_APICS) { > @@ -248,10 +286,29 @@ static int apic_init_common(SysBusDevice *dev) > info = APIC_COMMON_GET_CLASS(s); > info->init(s); > > - sysbus_init_mmio(&s->busdev, &s->io_memory); > + sysbus_init_mmio(dev, &s->io_memory); > + > + if (!vapic && s->vapic_control & VAPIC_ENABLE_MASK) { > + vapic = sysbus_create_simple("kvmvapic", -1, NULL); > + } > + s->vapic = vapic; > + if (apic_report_tpr_access && info->enable_tpr_reporting) { I think you should not rely on apic_report_tpr_access being in sane condition during class init. > + info->enable_tpr_reporting(s); > + } > + > return 0; > } > > +static void apic_dispatch_pre_save(void *opaque) > +{ > + APICCommonState *s = APIC_COMMON(opaque); > + APICCommonClass *info = APIC_COMMON_GET_CLASS(s); > + > + if (info->pre_save) { > + info->pre_save(s); > + } > +} > + > static int apic_dispatch_post_load(void *opaque, int version_id) > { > APICCommonState *s = APIC_COMMON(opaque); > @@ -269,6 +326,7 @@ static const VMStateDescription vmstate_apic_common = { > .minimum_version_id = 3, > .minimum_version_id_old = 1, > .load_state_old = apic_load_old, > + .pre_save = apic_dispatch_pre_save, > .post_load = apic_dispatch_post_load, > .fields = (VMStateField[]) { > VMSTATE_UINT32(apicbase, APICCommonState), > @@ -298,6 +356,8 @@ static const VMStateDescription vmstate_apic_common = { > static Property apic_properties_common[] = { > DEFINE_PROP_UINT8("id", APICCommonState, id, -1), > DEFINE_PROP_PTR("cpu_env", APICCommonState, cpu_env), > + DEFINE_PROP_BIT("vapic", APICCommonState, vapic_control, VAPIC_ENABLE_BIT, > + true), > DEFINE_PROP_END_OF_LIST(), > }; > > diff --git a/hw/apic_internal.h b/hw/apic_internal.h > index 0cab010..95cc7cf 100644 > --- a/hw/apic_internal.h > +++ b/hw/apic_internal.h > @@ -61,6 +61,9 @@ > #define APIC_SV_DIRECTED_IO (1<<12) > #define APIC_SV_ENABLE (1<<8) > > +#define VAPIC_ENABLE_BIT 0 > +#define VAPIC_ENABLE_MASK (1 << VAPIC_ENABLE_BIT) > + > #define MAX_APICS 255 > > #define MSI_SPACE_SIZE 0x100000 > @@ -82,7 +85,11 @@ typedef struct APICCommonClass > void (*init)(APICCommonState *s); > void (*set_base)(APICCommonState *s, uint64_t val); > void (*set_tpr)(APICCommonState *s, uint8_t val); > + uint8_t (*get_tpr)(APICCommonState *s); > + void (*enable_tpr_reporting)(APICCommonState *s); > + void (*vapic_base_update)(APICCommonState *s); > void (*external_nmi)(APICCommonState *s); > + void (*pre_save)(APICCommonState *s); > void (*post_load)(APICCommonState *s); > } APICCommonClass; > > @@ -114,9 +121,29 @@ struct APICCommonState { > int64_t timer_expiry; > int sipi_vector; > int wait_for_sipi; > + > + uint32_t vapic_control; > + DeviceState *vapic; > + target_phys_addr_t vapic_paddr; /* note: persistence via kvmvapic */ > }; > > +typedef struct VAPICState { > + uint8_t tpr; > + uint8_t isr; > + uint8_t zero; > + uint8_t irr; > + uint8_t enabled; > +} QEMU_PACKED VAPICState; > + > +extern bool apic_report_tpr_access; > + > void apic_report_irq_delivered(int delivered); > bool apic_next_timer(APICCommonState *s, int64_t current_time); > +void apic_enable_tpr_access_reporting(DeviceState *d); > +void apic_enable_vapic(DeviceState *d, target_phys_addr_t paddr); > +void apic_poll_irq(DeviceState *d); > + > +void vapic_report_tpr_access(DeviceState *dev, void *cpu, target_ulong ip, > + int access); > > #endif /* !QEMU_APIC_INTERNAL_H */ > diff --git a/hw/kvm/apic.c b/hw/kvm/apic.c > index dfc2ab3..326eb37 100644 > --- a/hw/kvm/apic.c > +++ b/hw/kvm/apic.c > @@ -92,6 +92,35 @@ static void kvm_apic_set_tpr(APICCommonState *s, uint8_t val) > s->tpr = (val & 0x0f) << 4; > } > > +static uint8_t kvm_apic_get_tpr(APICCommonState *s) > +{ > + return s->tpr >> 4; > +} > + > +static void kvm_apic_enable_tpr_reporting(APICCommonState *s) > +{ > + struct kvm_tpr_access_ctl ctl = { > + .enabled = 1 > + }; > + > + kvm_vcpu_ioctl(s->cpu_env, KVM_TPR_ACCESS_REPORTING, &ctl); > +} > + > +static void kvm_apic_vapic_base_update(APICCommonState *s) > +{ > + struct kvm_vapic_addr vapid_addr = { > + .vapic_addr = s->vapic_paddr, > + }; > + int ret; > + > + ret = kvm_vcpu_ioctl(s->cpu_env, KVM_SET_VAPIC_ADDR, &vapid_addr); > + if (ret < 0) { > + fprintf(stderr, "KVM: setting VAPIC address failed (%s)\n", > + strerror(-ret)); > + abort(); > + } > +} > + > static void do_inject_external_nmi(void *data) > { > APICCommonState *s = data; > @@ -129,6 +158,9 @@ static void kvm_apic_class_init(ObjectClass *klass, void *data) > k->init = kvm_apic_init; > k->set_base = kvm_apic_set_base; > k->set_tpr = kvm_apic_set_tpr; > + k->get_tpr = kvm_apic_get_tpr; > + k->enable_tpr_reporting = kvm_apic_enable_tpr_reporting; > + k->vapic_base_update = kvm_apic_vapic_base_update; > k->external_nmi = kvm_apic_external_nmi; > } > > diff --git a/hw/kvmvapic.c b/hw/kvmvapic.c > new file mode 100644 > index 0000000..0c4d304 > --- /dev/null > +++ b/hw/kvmvapic.c > @@ -0,0 +1,774 @@ > +/* > + * TPR optimization for 32-bit Windows guests > + * > + * Copyright (C) 2007-2008 Qumranet Technologies > + * Copyright (C) 2012 Jan Kiszka, Siemens AG > + * > + * This work is licensed under the terms of the GNU GPL version 2, or > + * (at your option) any later version. See the COPYING file in the > + * top-level directory. > + */ > +#include "sysemu.h" > +#include "cpus.h" > +#include "kvm.h" > +#include "apic_internal.h" > + > +#define APIC_DEFAULT_ADDRESS 0xfee00000 > + > +#define VAPIC_IO_PORT 0x7e > + > +#define VAPIC_INACTIVE 0 > +#define VAPIC_ACTIVE 1 > +#define VAPIC_STANDBY 2 Enums, please. > + > +#define VAPIC_CPU_SHIFT 7 > + > +#define ROM_BLOCK_SIZE 512 > +#define ROM_BLOCK_MASK (~(ROM_BLOCK_SIZE - 1)) > + > +typedef struct VAPICHandlers { > + uint32_t set_tpr; > + uint32_t set_tpr_eax; > + uint32_t get_tpr[8]; > + uint32_t get_tpr_stack; > +} QEMU_PACKED VAPICHandlers; > + > +typedef struct GuestROMState { > + char signature[8]; > + uint32_t vaddr; This does not look 64 bit clean. > + uint32_t fixup_start; > + uint32_t fixup_end; > + uint32_t vapic_vaddr; > + uint32_t vapic_size; > + uint32_t vcpu_shift; > + uint32_t real_tpr_addr; > + VAPICHandlers up; > + VAPICHandlers mp; > +} QEMU_PACKED GuestROMState; Why packed, is this passed to guest directly? > + > +typedef struct VAPICROMState { > + SysBusDevice busdev; > + MemoryRegion io; > + MemoryRegion rom; > + bool rom_mapped_writable; I'd put this later to avoid a structure hole. > + uint32_t state; > + uint32_t rom_state_paddr; > + uint32_t rom_state_vaddr; > + uint32_t vapic_paddr; > + uint32_t real_tpr_addr; > + GuestROMState rom_state; > + size_t rom_size; > +} VAPICROMState; > + > +#define TPR_INSTR_IS_WRITE 0x1 > +#define TPR_INSTR_ABS_MODRM 0x2 > +#define TPR_INSTR_MATCH_MODRM_REG 0x4 > + > +typedef struct TPRInstruction { > + uint8_t opcode; > + uint8_t modrm_reg; > + unsigned int flags; > + size_t length; > + off_t addr_offset; > +} TPRInstruction; Also here the order is pessimized. > + > +/* must be sorted by length, shortest first */ > +static const TPRInstruction tpr_instr[] = { > + { /* mov abs to eax */ > + .opcode = 0xa1, > + .length = 5, > + .addr_offset = 1, > + }, > + { /* mov eax to abs */ > + .opcode = 0xa3, > + .flags = TPR_INSTR_IS_WRITE, > + .length = 5, > + .addr_offset = 1, > + }, > + { /* mov r32 to r/m32 */ > + .opcode = 0x89, > + .flags = TPR_INSTR_IS_WRITE | TPR_INSTR_ABS_MODRM, > + .length = 6, > + .addr_offset = 2, > + }, > + { /* mov r/m32 to r32 */ > + .opcode = 0x8b, > + .flags = TPR_INSTR_ABS_MODRM, > + .length = 6, > + .addr_offset = 2, > + }, > + { /* push r/m32 */ > + .opcode = 0xff, > + .modrm_reg = 6, > + .flags = TPR_INSTR_ABS_MODRM | TPR_INSTR_MATCH_MODRM_REG, > + .length = 6, > + .addr_offset = 2, > + }, > + { /* mov imm32, r/m32 (c7/0) */ > + .opcode = 0xc7, > + .modrm_reg = 0, > + .flags = TPR_INSTR_IS_WRITE | TPR_INSTR_ABS_MODRM | > + TPR_INSTR_MATCH_MODRM_REG, > + .length = 10, > + .addr_offset = 2, > + }, > +}; > + > +static void read_guest_rom_state(VAPICROMState *s) > +{ > + cpu_physical_memory_rw(s->rom_state_paddr, (void *)&s->rom_state, > + sizeof(GuestROMState), 0); > +} > + > +static void write_guest_rom_state(VAPICROMState *s) > +{ > + cpu_physical_memory_rw(s->rom_state_paddr, (void *)&s->rom_state, > + sizeof(GuestROMState), 1); > +} > + > +static void update_guest_rom_state(VAPICROMState *s) > +{ > + read_guest_rom_state(s); > + > + s->rom_state.real_tpr_addr = cpu_to_le32(s->real_tpr_addr); > + s->rom_state.vcpu_shift = cpu_to_le32(VAPIC_CPU_SHIFT); > + > + write_guest_rom_state(s); > +} > + > +static int find_real_tpr_addr(VAPICROMState *s, CPUState *env) > +{ > + target_phys_addr_t paddr; > + target_ulong addr; > + > + if (s->state == VAPIC_ACTIVE) { > + return 0; > + } > + for (addr = 0xfffff000; addr >= 0x80000000; addr -= TARGET_PAGE_SIZE) { > + paddr = cpu_get_phys_page_debug(env, addr); > + if (paddr != APIC_DEFAULT_ADDRESS) { > + continue; > + } > + s->real_tpr_addr = addr + 0x80; > + update_guest_rom_state(s); > + return 0; > + } This loop looks odd, what should it do, probe for unused address? > + return -1; > +} > + > +static uint8_t modrm_reg(uint8_t modrm) > +{ > + return (modrm >> 3) & 7; > +} > + > +static bool is_abs_modrm(uint8_t modrm) > +{ > + return (modrm & 0xc7) == 0x05; > +} > + > +static bool opcode_matches(uint8_t *opcode, const TPRInstruction *instr) > +{ > + return opcode[0] == instr->opcode && > + (!(instr->flags & TPR_INSTR_ABS_MODRM) || is_abs_modrm(opcode[1])) && > + (!(instr->flags & TPR_INSTR_MATCH_MODRM_REG) || > + modrm_reg(opcode[1]) == instr->modrm_reg); > +} > + > +static int evaluate_tpr_instruction(VAPICROMState *s, CPUState *env, > + target_ulong *pip, int access) > +{ > + const TPRInstruction *instr; > + target_ulong ip = *pip; > + uint8_t opcode[2]; > + uint32_t real_tpr_addr; > + int i; > + > + if ((ip & 0xf0000000) != 0x80000000 && (ip & 0xf0000000) != 0xe0000000) { The constants should be using ULL suffix because target_ulong could be 64 bit, though maybe this is more optimal. > + return -1; > + } > + > + /* > + * Early Windows 2003 SMP initialization contains a > + * > + * mov imm32, r/m32 > + * > + * instruction that is patched by TPR optimization. The problem is that > + * RSP, used by the patched instruction, is zero, so the guest gets a > + * double fault and dies. > + */ > + if (env->regs[R_ESP] == 0) { > + return -1; > + } > + > + if (access == TPR_ACCESS_WRITE && kvm_enabled() && > + !kvm_irqchip_in_kernel()) { > + /* > + * KVM without TPR access reporting calls into the user space APIC on > + * write with IP pointing after the accessing instruction. So we need > + * to look backward to find the reason. > + */ > + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { > + instr = &tpr_instr[i]; > + if (!(instr->flags & TPR_INSTR_IS_WRITE)) { > + continue; > + } > + if (cpu_memory_rw_debug(env, ip - instr->length, opcode, > + sizeof(opcode), 0) < 0) { > + return -1; > + } > + if (opcode_matches(opcode, instr)) { > + ip -= instr->length; > + goto instruction_ok; > + } > + } > + return -1; > + } else { > + if (cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0) < 0) { > + return -1; > + } > + for (i = 0; i < ARRAY_SIZE(tpr_instr); i++) { > + instr = &tpr_instr[i]; > + if (opcode_matches(opcode, instr)) { > + goto instruction_ok; > + } > + } > + return -1; > + } > + > +instruction_ok: > + /* > + * Grab the virtual TPR address from the instruction > + * and update the cached values. > + */ > + if (cpu_memory_rw_debug(env, ip + instr->addr_offset, > + (void *)&real_tpr_addr, > + sizeof(real_tpr_addr), 0) < 0) { > + return -1; > + } > + real_tpr_addr = le32_to_cpu(real_tpr_addr); > + if ((real_tpr_addr & 0xfff) != 0x80) { > + return -1; > + } > + s->real_tpr_addr = real_tpr_addr; > + update_guest_rom_state(s); > + > + *pip = ip; > + return 0; > +} > + > +static int update_rom_mapping(VAPICROMState *s, CPUState *env, target_ulong ip) > +{ > + target_phys_addr_t paddr; > + uint32_t rom_state_vaddr; > + uint32_t pos, patch, offset; > + > + /* nothing to do if already activated */ > + if (s->state == VAPIC_ACTIVE) { > + return 0; > + } > + > + /* bail out if ROM init code was not executed (missing ROM?) */ > + if (s->state == VAPIC_INACTIVE) { > + return -1; > + } > + > + /* find out virtual address of the ROM */ > + rom_state_vaddr = s->rom_state_paddr + (ip & 0xf0000000); > + paddr = cpu_get_phys_page_debug(env, rom_state_vaddr); > + if (paddr == -1) { > + return -1; > + } > + paddr += rom_state_vaddr & ~TARGET_PAGE_MASK; > + if (paddr != s->rom_state_paddr) { > + return -1; > + } > + read_guest_rom_state(s); > + if (memcmp(s->rom_state.signature, "kvm aPiC", 8) != 0) { > + return -1; > + } > + s->rom_state_vaddr = rom_state_vaddr; > + > + /* fixup addresses in ROM if needed */ > + if (rom_state_vaddr == le32_to_cpu(s->rom_state.vaddr)) { > + return 0; > + } > + for (pos = le32_to_cpu(s->rom_state.fixup_start); > + pos < le32_to_cpu(s->rom_state.fixup_end); > + pos += 4) { > + cpu_physical_memory_rw(paddr + pos - s->rom_state.vaddr, > + (void *)&offset, sizeof(offset), 0); > + offset = le32_to_cpu(offset); > + cpu_physical_memory_rw(paddr + offset, (void *)&patch, > + sizeof(patch), 0); > + patch = le32_to_cpu(patch); > + patch += rom_state_vaddr - le32_to_cpu(s->rom_state.vaddr); > + patch = cpu_to_le32(patch); > + cpu_physical_memory_rw(paddr + offset, (void *)&patch, > + sizeof(patch), 1); > + } > + read_guest_rom_state(s); > + s->vapic_paddr = paddr + le32_to_cpu(s->rom_state.vapic_vaddr) - > + le32_to_cpu(s->rom_state.vaddr); > + > + return 0; > +} > + > +/* > + * Tries to read the unique processor number from the Kernel Processor Control > + * Region (KPCR) of 32-bit Windows. Returns -1 if the KPCR cannot be accessed > + * or is considered invalid. > + */ Horrible hack. Is guest OS type or version checked somewhere? > +static int get_kpcr_number(CPUState *env) > +{ > + struct kpcr { > + uint8_t fill1[0x1c]; > + uint32_t self; > + uint8_t fill2[0x31]; > + uint8_t number; > + } QEMU_PACKED kpcr; KPCR. Pointers to Windows documentation would be nice. > + > + if (cpu_memory_rw_debug(env, env->segs[R_FS].base, > + (void *)&kpcr, sizeof(kpcr), 0) < 0 || > + kpcr.self != env->segs[R_FS].base) { > + return -1; > + } > + return kpcr.number; > +} > + > +static int vapic_enable(VAPICROMState *s, CPUState *env) > +{ > + int cpu_number = get_kpcr_number(env); > + target_phys_addr_t vapic_paddr; > + static const uint8_t enabled = 1; > + > + if (cpu_number < 0) { > + return -1; > + } > + vapic_paddr = s->vapic_paddr + > + (((target_phys_addr_t)cpu_number) << VAPIC_CPU_SHIFT); > + cpu_physical_memory_rw(vapic_paddr + offsetof(VAPICState, enabled), > + (void *)&enabled, sizeof(enabled), 1); > + apic_enable_vapic(env->apic_state, vapic_paddr); > + > + s->state = VAPIC_ACTIVE; > + > + return 0; > +} > + > +static void patch_byte(CPUState *env, target_ulong addr, uint8_t byte) > +{ > + cpu_memory_rw_debug(env, addr, &byte, 1, 1); > +} > + > +static void patch_call(VAPICROMState *s, CPUState *env, target_ulong ip, > + uint32_t target) > +{ > + uint32_t offset; > + > + offset = cpu_to_le32(target - ip - 5); > + patch_byte(env, ip, 0xe8); /* call near */ > + cpu_memory_rw_debug(env, ip + 1, (void *)&offset, sizeof(offset), 1); > +} > + > +static void patch_instruction(VAPICROMState *s, CPUState *env, target_ulong ip) > +{ > + target_phys_addr_t paddr; > + VAPICHandlers *handlers; > + uint8_t opcode[2]; > + uint32_t imm32; > + > + if (smp_cpus == 1) { > + handlers = &s->rom_state.up; > + } else { > + handlers = &s->rom_state.mp; > + } > + > + pause_all_vcpus(); > + > + cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0); > + > + switch (opcode[0]) { > + case 0x89: /* mov r32 to r/m32 */ > + patch_byte(env, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */ > + patch_call(s, env, ip + 1, handlers->set_tpr); > + break; > + case 0x8b: /* mov r/m32 to r32 */ > + patch_byte(env, ip, 0x90); > + patch_call(s, env, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]); > + break; > + case 0xa1: /* mov abs to eax */ > + patch_call(s, env, ip, handlers->get_tpr[0]); > + break; > + case 0xa3: /* mov eax to abs */ > + patch_call(s, env, ip, handlers->set_tpr_eax); > + break; > + case 0xc7: /* mov imm32, r/m32 (c7/0) */ > + patch_byte(env, ip, 0x68); /* push imm32 */ > + cpu_memory_rw_debug(env, ip + 6, (void *)&imm32, sizeof(imm32), 0); > + cpu_memory_rw_debug(env, ip + 1, (void *)&imm32, sizeof(imm32), 1); > + patch_call(s, env, ip + 5, handlers->set_tpr); > + break; > + case 0xff: /* push r/m32 */ > + patch_byte(env, ip, 0x50); /* push eax */ > + patch_call(s, env, ip + 1, handlers->get_tpr_stack); > + break; > + default: > + abort(); > + } > + > + resume_all_vcpus(); > + > + paddr = cpu_get_phys_page_debug(env, ip); > + paddr += ip & ~TARGET_PAGE_MASK; > + tb_invalidate_phys_page_range(paddr, paddr + 1, 1); > +} > + > +void vapic_report_tpr_access(DeviceState *dev, void *cpu, target_ulong ip, > + int access) > +{ > + VAPICROMState *s = DO_UPCAST(VAPICROMState, busdev.qdev, dev); > + CPUState *env = cpu; > + > + cpu_synchronize_state(env); > + > + if (evaluate_tpr_instruction(s, env, &ip, access) < 0) { > + if (s->state == VAPIC_ACTIVE) { > + vapic_enable(s, env); > + } > + return; > + } > + if (update_rom_mapping(s, env, ip) < 0) { > + return; > + } > + if (vapic_enable(s, env) < 0) { > + return; > + } > + patch_instruction(s, env, ip); > +} > + > +static void vapic_reset(DeviceState *dev) > +{ > + VAPICROMState *s = DO_UPCAST(VAPICROMState, busdev.qdev, dev); > + > + if (s->state == VAPIC_ACTIVE) { > + s->state = VAPIC_STANDBY; > + } > +} > + > +static int patch_hypercalls(VAPICROMState *s) > +{ > + target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; > + static uint8_t vmcall_pattern[] = { const > + 0xb8, 0x1, 0, 0, 0, 0xf, 0x1, 0xc1 > + }; > + static uint8_t outl_pattern[] = { const > + 0xb8, 0x1, 0, 0, 0, 0x90, 0xe7, 0x7e > + }; > + uint8_t alternates[2]; > + uint8_t *pattern; > + uint8_t *patch; > + int patches = 0; > + off_t pos; > + uint8_t *rom; > + > + rom = g_malloc(s->rom_size); > + cpu_physical_memory_rw(rom_paddr, rom, s->rom_size, 0); > + > + for (pos = 0; pos < s->rom_size - sizeof(vmcall_pattern); pos++) { > + if (kvm_irqchip_in_kernel()) { > + pattern = outl_pattern; > + alternates[0] = outl_pattern[7]; > + alternates[1] = outl_pattern[7]; > + patch = &vmcall_pattern[5]; > + } else { > + pattern = vmcall_pattern; > + alternates[0] = vmcall_pattern[7]; > + alternates[1] = 0xd9; /* AMD's VMMCALL */ > + patch = &outl_pattern[5]; > + } > + if (memcmp(rom + pos, pattern, 7) == 0 && > + (rom[pos + 7] == alternates[0] || rom[pos + 7] == alternates[1])) { > + cpu_physical_memory_rw(rom_paddr + pos + 5, patch, 3, 1); > + /* > + * Don't flush the tb here. Under ordinary conditions, the patched > + * calls are miles away from the current IP. Under malicious > + * conditions, the guest could trick us to crash. > + */ > + } > + } > + > + g_free(rom); > + > + if (patches != 0 && patches != 2) { > + return -1; > + } > + > + return 0; > +} > + > +static void vapic_map_rom_writable(VAPICROMState *s) > +{ > + target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; > + MemoryRegionSection section; > + MemoryRegion *as; > + size_t rom_size; > + uint8_t *ram; > + > + as = sysbus_address_space(&s->busdev); > + > + if (s->rom_mapped_writable) { > + memory_region_del_subregion(as, &s->rom); > + memory_region_destroy(&s->rom); > + } > + > + /* grab RAM memory region (region @rom_paddr may still be pc.rom) */ > + section = memory_region_find(as, 0, 1); > + > + /* read ROM size from RAM region */ > + ram = memory_region_get_ram_ptr(section.mr); > + rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE; > + s->rom_size = rom_size; > + > + /* We need to round up to avoid creating subpages > + * from which we cannot run code. */ > + rom_size = TARGET_PAGE_ALIGN(rom_size); > + > + memory_region_init_alias(&s->rom, "kvmvapic-rom", section.mr, rom_paddr, > + rom_size); > + memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000); > + s->rom_mapped_writable = true; > +} > + > +static void do_enable_tpr_reporting(void *data) > +{ > + CPUState *env = data; > + > + apic_enable_tpr_access_reporting(env->apic_state); > +} > + > +static void vapic_enable_tpr_reporting(void) > +{ > + CPUState *env = cpu_single_env; > + > + for (env = first_cpu; env != NULL; env = env->next_cpu) { > + run_on_cpu(env, do_enable_tpr_reporting, env); > + } > +} > + > +static int vapic_prepare(VAPICROMState *s) > +{ > + vapic_map_rom_writable(s); > + > + if (patch_hypercalls(s) < 0) { > + return -1; > + } > + > + vapic_enable_tpr_reporting(); > + > + return 0; > +} > + > +static void vapic_write(void *opaque, target_phys_addr_t addr, uint64_t data, > + unsigned int size) > +{ > + CPUState *env = cpu_single_env; > + target_phys_addr_t rom_paddr; > + VAPICROMState *s = opaque; > + > + cpu_synchronize_state(env); > + > + /* > + * The VAPIC supports two PIO-based hypercalls, both via port 0x7E. > + * o 16-bit write access: > + * Reports the option ROM initialization to the hypervisor. Written > + * value is the offset of the state structure in the ROM. > + * o 8-bit write access: > + * Reactivates the VAPIC after a guest hibernation, i.e. after the > + * option ROM content has been re-initialized by a guest power cycle. > + * o 32-bit write access: > + * Poll for pending IRQs, considering the current VAPIC state. > + */ Different operation depending on size? Interesting. > + switch (size) { > + case 2: > + if (s->state != VAPIC_INACTIVE) { > + patch_hypercalls(s); > + break; > + } > + > + rom_paddr = (env->segs[R_CS].base + env->eip) & ROM_BLOCK_MASK; > + s->rom_state_paddr = rom_paddr + data; > + > + if (vapic_prepare(s) < 0) { > + break; > + } > + s->state = VAPIC_STANDBY; > + break; > + case 1: > + if (kvm_enabled()) { > + /* > + * Disable triggering instruction in ROM by writing a NOP. > + * > + * We cannot do this in TCG mode as the reported IP is not > + * reliable. Given the hack level of the whole, it would not be impossible to find the IP using search PC. > + */ > + pause_all_vcpus(); > + patch_byte(env, env->eip - 2, 0x66); > + patch_byte(env, env->eip - 1, 0x90); > + resume_all_vcpus(); > + } > + > + if (s->state == VAPIC_ACTIVE) { > + break; > + } > + if (update_rom_mapping(s, env, env->eip) < 0) { > + break; > + } > + if (find_real_tpr_addr(s, env) < 0) { > + break; > + } > + vapic_enable(s, env); > + break; > + default: > + case 4: > + if (!kvm_irqchip_in_kernel()) { > + apic_poll_irq(env->apic_state); > + } > + break; > + } > +} > + > +static const MemoryRegionOps vapic_ops = { > + .write = vapic_write, > + .endianness = DEVICE_NATIVE_ENDIAN, > +}; > + > +static int vapic_init(SysBusDevice *dev) > +{ > + VAPICROMState *s = FROM_SYSBUS(VAPICROMState, dev); > + > + memory_region_init_io(&s->io, &vapic_ops, s, "kvmvapic", 2); > + sysbus_add_io(dev, VAPIC_IO_PORT, &s->io); > + sysbus_init_ioports(dev, VAPIC_IO_PORT, 2); > + > + option_rom[nb_option_roms].name = "kvmvapic.bin"; > + option_rom[nb_option_roms].bootindex = -1; > + nb_option_roms++; > + > + return 0; > +} > + > +static void do_vapic_enable(void *data) > +{ > + VAPICROMState *s = data; > + > + vapic_enable(s, first_cpu); > +} > + > +static int vapic_post_load(void *opaque, int version_id) > +{ > + VAPICROMState *s = opaque; > + uint8_t *zero; > + > + /* > + * The old implementation of qemu-kvm did not provide the state > + * VAPIC_STANDBY. Reconstruct it. > + */ > + if (s->state == VAPIC_INACTIVE && s->rom_state_paddr != 0) { > + s->state = VAPIC_STANDBY; > + } > + > + if (s->state != VAPIC_INACTIVE) { > + if (vapic_prepare(s) < 0) { > + return -1; > + } > + } > + if (s->state == VAPIC_ACTIVE) { > + if (smp_cpus == 1) { > + run_on_cpu(first_cpu, do_vapic_enable, s); > + } else { > + zero = g_malloc0(s->rom_state.vapic_size); > + cpu_physical_memory_rw(s->vapic_paddr, zero, > + s->rom_state.vapic_size, 1); > + g_free(zero); > + } > + } > + > + return 0; > +} > + > +static const VMStateDescription vmstate_handlers = { > + .name = "kvmvapic-handlers", > + .version_id = 1, > + .minimum_version_id = 1, > + .minimum_version_id_old = 1, > + .fields = (VMStateField[]) { > + VMSTATE_UINT32(set_tpr, VAPICHandlers), > + VMSTATE_UINT32(set_tpr_eax, VAPICHandlers), > + VMSTATE_UINT32_ARRAY(get_tpr, VAPICHandlers, 8), > + VMSTATE_UINT32(get_tpr_stack, VAPICHandlers), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static const VMStateDescription vmstate_guest_rom = { > + .name = "kvmvapic-guest-rom", > + .version_id = 1, > + .minimum_version_id = 1, > + .minimum_version_id_old = 1, > + .fields = (VMStateField[]) { > + VMSTATE_UNUSED(8), /* signature */ > + VMSTATE_UINT32(vaddr, GuestROMState), > + VMSTATE_UINT32(fixup_start, GuestROMState), > + VMSTATE_UINT32(fixup_end, GuestROMState), > + VMSTATE_UINT32(vapic_vaddr, GuestROMState), > + VMSTATE_UINT32(vapic_size, GuestROMState), > + VMSTATE_UINT32(vcpu_shift, GuestROMState), > + VMSTATE_UINT32(real_tpr_addr, GuestROMState), > + VMSTATE_STRUCT(up, GuestROMState, 0, vmstate_handlers, VAPICHandlers), > + VMSTATE_STRUCT(mp, GuestROMState, 0, vmstate_handlers, VAPICHandlers), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static const VMStateDescription vmstate_vapic = { > + .name = "kvm-tpr-opt", /* compatible with qemu-kvm VAPIC */ > + .version_id = 1, > + .minimum_version_id = 1, > + .minimum_version_id_old = 1, > + .post_load = vapic_post_load, > + .fields = (VMStateField[]) { > + VMSTATE_STRUCT(rom_state, VAPICROMState, 0, vmstate_guest_rom, > + GuestROMState), > + VMSTATE_UINT32(state, VAPICROMState), > + VMSTATE_UINT32(real_tpr_addr, VAPICROMState), > + VMSTATE_UINT32(rom_state_vaddr, VAPICROMState), > + VMSTATE_UINT32(vapic_paddr, VAPICROMState), > + VMSTATE_UINT32(rom_state_paddr, VAPICROMState), > + VMSTATE_END_OF_LIST() > + } > +}; > + > +static void vapic_class_init(ObjectClass *klass, void *data) > +{ > + SysBusDeviceClass *sc = SYS_BUS_DEVICE_CLASS(klass); > + DeviceClass *dc = DEVICE_CLASS(klass); > + > + dc->no_user = 1; > + dc->reset = vapic_reset; > + dc->vmsd = &vmstate_vapic; > + sc->init = vapic_init; > +} > + > +static TypeInfo vapic_type = { > + .name = "kvmvapic", > + .parent = TYPE_SYS_BUS_DEVICE, > + .instance_size = sizeof(VAPICROMState), > + .class_init = vapic_class_init, > +}; > + > +static void vapic_register(void) > +{ > + type_register_static(&vapic_type); > +} > + > +device_init(vapic_register); > -- > 1.7.3.4 > > -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html