On 02/05/2012 02:39 PM, Jan Kiszka wrote: > From: Jan Kiszka <jan.kiszka@xxxxxxxxxxx> > > This enables acceleration for MMIO-based TPR registers accesses of > 32-bit Windows guest systems. It is mostly useful with KVM enabled, > either on older Intel CPUs (without flexpriority feature, can also be > manually disabled for testing) or any current AMD processor. > > The approach introduced here is derived from the original version of > qemu-kvm. It was refactored, documented, and extended by support for > user space APIC emulation, both with and without KVM acceleration. However, it's presented as a rewrite instead of a series of changes, so we can't see what the changes are. Having said that, the end result is way, way better than the original. > The > VMState format was kept compatible, so was the ABI to the option ROM > that implements the guest-side para-virtualized driver service. This > enables seamless migration from qemu-kvm to upstream or, one day, > between KVM and TCG mode. > > The basic concept goes like this: > - VAPIC PV interface consisting of I/O port 0x7e and (for KVM in-kernel > irqchip) a vmcall hypercall is registered > - VAPIC option ROM is loaded into guest > - option ROM activates TPR MMIO access reporting via port 0x7e > - TPR accesses are trapped and patched in the guest to call into option > ROM instead, VAPIC support is enabled > - option ROM TPR helpers track state in memory and invoke hypercall to > poll for pending IRQs if required > > + > +static int evaluate_tpr_instruction(VAPICROMState *s, CPUState *env, > + target_ulong *pip, int access) > +{ > + target_ulong addr_offset; > + target_ulong ip = *pip; > + uint8_t opcode[2]; > + uint32_t real_tpr_addr; > + > + if ((ip & 0xf0000000) != 0x80000000 && (ip & 0xf0000000) != 0xe0000000) { > + return -1; > + } > + > + /* > + * Early Windows 2003 SMP initialization contains a > + * > + * mov imm32, r/m32 > + * > + * instruction that is patched by TPR optimization. The problem is that > + * RSP, used by the patched instruction, is zero, so the guest gets a > + * double fault and dies. > + */ > + if (env->regs[R_ESP] == 0) { > + return -1; > + } > + > + if (access == TPR_ACCESS_WRITE && kvm_enabled() && > + !kvm_irqchip_in_kernel()) { > + /* > + * KVM without TPR access reporting calls into the user space APIC on > + * write with IP pointing after the accessing instruction. So we need > + * to look backward to find the reason. > + */ Why do we need to do anything at all? I'm not sure if the ABI guarantees anything about %rip. > + ip -= 5; > + if (cpu_memory_rw_debug(env, ip, opcode, 1, 0) < 0) { > + return -1; > + } > + if (opcode[0] == 0xa3) { > + addr_offset = 1; > + goto instruction_ok; > + } > + > + ip--; > + if (cpu_memory_rw_debug(env, ip, opcode, 2, 0) < 0) { > + return -1; > + } > + if (opcode[0] == 0x89 && is_abs_modrm(opcode[1])) { > + addr_offset = 2; > + goto instruction_ok; > + } > + > + ip -= 4; > + if (cpu_memory_rw_debug(env, ip, opcode, 2, 0) < 0) { > + return -1; > + } > + if (opcode[0] != 0xc7 || modrm_reg(opcode[1]) != 0 || > + !is_abs_modrm(opcode[1])) { > + return -1; > + } > + addr_offset = 2; > + } else { > + if (cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0) < 0) { > + return -1; > + } > + > + switch (opcode[0]) { > + case 0xc7: /* mov imm32, r/m32 (c7/0) */ > + if (modrm_reg(opcode[1]) != 0) { > + return -1; > + } > + /* fall through */ > + case 0x89: /* mov r32 to r/m32 */ > + case 0x8b: /* mov r/m32 to r32 */ > + if (!is_abs_modrm(opcode[1])) { > + return -1; > + } > + addr_offset = 2; > + break; > + case 0xa1: /* mov abs to eax */ > + case 0xa3: /* mov eax to abs */ > + addr_offset = 1; > + break; > + case 0xff: /* push r/m32 */ > + if (modrm_reg(opcode[1]) != 6 || !is_abs_modrm(opcode[1])) { > + return -1; > + } > + addr_offset = 2; > + break; > + default: > + return -1; > + } You could code this as a table of predicate functions and insn sizes. When working backward you use the sizes to offset %rip. This eliminates duplication. > + } > + > +instruction_ok: > + /* > + * Grab the virtual TPR address from the instruction > + * and update the cached values. > + */ > + if (cpu_memory_rw_debug(env, ip + addr_offset, (void *)&real_tpr_addr, > + sizeof(real_tpr_addr), 0) < 0) { > + return -1; > + } > + real_tpr_addr = le32_to_cpu(real_tpr_addr); > + if ((real_tpr_addr & 0xfff) != 0x80) { > + return -1; > + } > + s->real_tpr_addr = real_tpr_addr; > + update_guest_rom_state(s); > + > + *pip = ip; > + return 0; > +} > + > + > +/* > + * Tries to read the unique processor number from the Kernel Processor Control > + * Region (KPCR) of 32-bit Windows. Returns -1 if the KPCR cannot be accessed > + * or is considered invalid. > + */ > +static int get_kpcr_number(CPUState *env) > +{ > + struct kpcr { > + uint8_t fill1[0x1c]; > + uint32_t self; > + uint8_t fill2[0x31]; > + uint8_t number; > + } QEMU_PACKED kpcr; > + > + if (smp_cpus == 1) { > + return 0; > + } > + if (cpu_memory_rw_debug(env, env->segs[R_FS].base, > + (void *)&kpcr, sizeof(kpcr), 0) < 0 || > + kpcr.self != env->segs[R_FS].base) { Ah, so it works. We may want to do it for UP as well, as a way of verifying that the guest is compatible with these hacks. > + return -1; > + } > + return kpcr.number; > +} > +static void patch_instruction(VAPICROMState *s, CPUState *env, target_ulong ip) > +{ > + target_phys_addr_t paddr; > + VAPICHandlers *handlers; > + uint8_t opcode[2]; > + uint32_t imm32; > + > + if (smp_cpus == 1) { > + handlers = &s->rom_state.up; > + } else { > + handlers = &s->rom_state.mp; > + } > + > + cpu_memory_rw_debug(env, ip, opcode, sizeof(opcode), 0); > + > + switch (opcode[0]) { > + case 0x89: /* mov r32 to r/m32 */ > + patch_byte(env, ip, 0x50 + modrm_reg(opcode[1])); /* push reg */ > + patch_call(s, env, ip + 1, handlers->set_tpr); > + break; > + case 0x8b: /* mov r/m32 to r32 */ > + patch_byte(env, ip, 0x90); > + patch_call(s, env, ip + 1, handlers->get_tpr[modrm_reg(opcode[1])]); > + break; > + case 0xa1: /* mov abs to eax */ > + patch_call(s, env, ip, handlers->get_tpr[0]); > + break; > + case 0xa3: /* mov eax to abs */ > + patch_call(s, env, ip, handlers->set_tpr_eax); > + break; > + case 0xc7: /* mov imm32, r/m32 (c7/0) */ > + patch_byte(env, ip, 0x68); /* push imm32 */ > + cpu_memory_rw_debug(env, ip + 6, (void *)&imm32, sizeof(imm32), 0); > + cpu_memory_rw_debug(env, ip + 1, (void *)&imm32, sizeof(imm32), 1); > + patch_call(s, env, ip + 5, handlers->set_tpr); > + break; > + case 0xff: /* push r/m32 */ > + patch_byte(env, ip, 0x50); /* push eax */ > + patch_call(s, env, ip + 1, handlers->get_tpr_stack); > + break; With a table, we could put the patch sequences there as well. > + default: > + abort(); > + } > + > + paddr = cpu_get_phys_page_debug(env, ip); > + paddr += ip & ~TARGET_PAGE_MASK; > + tb_invalidate_phys_page_range(paddr, paddr + 1, 1); > +} > + > + > +static void vapic_map_rom_writable(VAPICROMState *s) > +{ > + target_phys_addr_t rom_paddr = s->rom_state_paddr & ROM_BLOCK_MASK; > + MemoryRegionSection section; > + MemoryRegion *as; > + size_t rom_size; > + uint8_t *ram; > + > + as = sysbus_address_space(&s->busdev); > + > + if (s->rom_mapped_writable) { > + memory_region_del_subregion(as, &s->rom); > + memory_region_destroy(&s->rom); > + } > + > + /* grab RAM memory region (region @rom_paddr may still be pc.rom) */ > + section = memory_region_find(as, 0, 1); > + > + /* read ROM size from RAM region */ > + ram = memory_region_get_ram_ptr(section.mr); > + rom_size = ram[rom_paddr + 2] * ROM_BLOCK_SIZE; > + s->rom_size = rom_size; > + > + /* FIXME: round up as everything underneath would fall apart otherwise > + * (subpages are broken) */ > + rom_size = TARGET_PAGE_ALIGN(rom_size); Ok. Subpages aren't broken (at least, they can't be expected to work here). Without a page aligned region you can't expect kvm to map this area. Wrt tcg, it should work, but I don't think anyone ever tested tcg out of subpage. > + > + memory_region_init_alias(&s->rom, "kvmvapic-rom", section.mr, rom_paddr, > + rom_size); > + memory_region_add_subregion_overlap(as, rom_paddr, &s->rom, 1000); This is incredibly hacky, so at least the spirit of the original code is preserved. > + s->rom_mapped_writable = true; > +} > + > Impressive refactoring overall; I thought that code was a basket case. -- error compiling committee.c: too many arguments to function -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html