On Thu, Dec 10, 2009 at 08:38:24PM +0200, oritw@xxxxxxxxxx wrote:
> From: Orit Wasserman <oritw@xxxxxxxxxx>
> 
> ---
>  arch/x86/kvm/vmx.c | 235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
>  arch/x86/kvm/x86.c |   5 +-
>  arch/x86/kvm/x86.h |   3 +
>  3 files changed, 240 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 2726a6c..a7ffd5e 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -93,13 +93,39 @@ struct shared_msr_entry {
>  };
>  
>  struct __attribute__ ((__packed__)) level_state {
> +	/* Has the level1 guest done vmclear? */
> +	bool vmclear;
> +};
> +
> +/*
> + * This structure is mapped to guest memory.
> + * It is packed in order to preserve the binary content
> + * after live migration.
> + * If there are changes in the content or layout, the revision_id
> + * must be updated.
> + */
> +struct __attribute__ ((__packed__)) nested_vmcs_page {
> +	u32 revision_id;
> +	u32 abort;
> +	struct level_state l2_state;
> +};
> +
> +struct nested_vmcs_list {
> +	struct list_head list;
> +	gpa_t vmcs_addr;
> +	struct vmcs *l2_vmcs;
>  };
>  
>  struct nested_vmx {
>  	/* Has the level1 guest done vmxon? */
>  	bool vmxon;
> +	/* The location of the current vmcs that l1 keeps for l2 */
> +	gpa_t current_vmptr;
>  	/* Level 1 state for switching to level 2 and back */
>  	struct level_state *l1_state;
> +	/* List of vmcs structures, one for each l2 guest created by l1 */
> +	struct list_head l2_vmcs_list;
> +	/* l2 page corresponding to the current vmcs set by l1 */
> +	struct nested_vmcs_page *current_l2_page;
>  };
>  
>  struct vcpu_vmx {
> @@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
>  	return container_of(vcpu, struct vcpu_vmx, vcpu);
>  }
>  
> +static struct page *nested_get_page(struct kvm_vcpu *vcpu,
> +				    u64 vmcs_addr)
> +{
> +	struct page *vmcs_page = NULL;
> +
> +	down_read(&current->mm->mmap_sem);
> +	vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
> +	up_read(&current->mm->mmap_sem);
> +
> +	if (is_error_page(vmcs_page)) {
> +		printk(KERN_ERR "%s error allocating page 0x%llx\n",
> +		       __func__, vmcs_addr);
> +		kvm_release_page_clean(vmcs_page);
> +		return NULL;
> +	}
> +
> +	return vmcs_page;
> +
> +}
> +
> +static int nested_map_current(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct page *vmcs_page =
> +		nested_get_page(vcpu, vmx->nested.current_vmptr);
> +	struct nested_vmcs_page *mapped_page;
> +
> +	if (vmcs_page == NULL) {
> +		printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
> +		return 0;
> +	}
> +
> +	if (vmx->nested.current_l2_page) {
> +		printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
> +		WARN_ON(1);
> +		return 0;
> +	}
> +
> +	mapped_page = kmap_atomic(vmcs_page, KM_USER0);
> +
> +	if (!mapped_page) {
> +		printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
> +		return 0;
> +	}
> +
> +	vmx->nested.current_l2_page = mapped_page;
> +
> +	return 1;
> +}
> +
> +static void nested_unmap_current(struct kvm_vcpu *vcpu)
> +{
> +	struct page *page;
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +
> +	if (!vmx->nested.current_l2_page) {
> +		printk(KERN_INFO "Shadow vmcs already unmapped\n");
> +		WARN_ON(1);
> +		return;
> +	}
> +
> +	page = kmap_atomic_to_page(vmx->nested.current_l2_page);
> +
> +	kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
> +
> +	kvm_release_page_dirty(page);
> +
> +	vmx->nested.current_l2_page = NULL;
> +}
> +
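A general note on these two helpers: kmap_atomic() disables preemption and
never returns NULL, so the !mapped_page check above is dead code, and nothing
between nested_map_current() and nested_unmap_current() may sleep. The usage
this patch relies on is a short atomic section, roughly (untested sketch):

	if (!nested_map_current(vcpu))
		return 1;
	/* atomic mapping held here: no sleeping until the unmap */
	vmx->nested.current_l2_page->l2_state.vmclear = 1;
	nested_unmap_current(vcpu);
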
>  static int init_rmode(struct kvm *kvm);
>  static u64 construct_eptp(unsigned long root_hpa);
>  
> @@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
>  	return 0;
>  }
>  
> +static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
> +{
> +	int r = 0;
> +	uint size;
> +
> +	*gentry = 0;
> +
> +	if (is_long_mode(vcpu))
> +		size = sizeof(u64);
> +	else
> +		size = sizeof(u32);
> +
> +	r = kvm_read_guest_virt(gva, gentry,
> +				size, vcpu);
> +	if (r) {
> +		printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
> +		       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
> +		return r;
> +	}
> +
> +	if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
> +		printk(KERN_DEBUG "%s addr %llx not aligned\n",
> +		       __func__, *gentry);
> +		return 1;
> +	}
> +
> +	return 0;
> +}
> +
>  /*
>   * Writes msr value into the appropriate "register".
>   * Returns 0 on success, non-0 otherwise.
> @@ -1316,6 +1441,7 @@ static int create_l1_state(struct kvm_vcpu *vcpu)
>  	} else
>  		return 0;
>  
> +	INIT_LIST_HEAD(&(vmx->nested.l2_vmcs_list));
>  	return 0;
>  }
>  
> @@ -1488,15 +1614,35 @@ static void free_vmcs(struct vmcs *vmcs)
>  	free_pages((unsigned long)vmcs, vmcs_config.order);
>  }
>  
> +static void nested_free_current_vmcs(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct nested_vmcs_list *list_item, *n;
> +
> +	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list, list)
> +		if (list_item->vmcs_addr == vmx->nested.current_vmptr) {
> +			free_vmcs(list_item->l2_vmcs);
> +			list_del(&(list_item->list));
> +			return;
> +		}
> +}
> +
>  static void free_l1_state(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct nested_vmcs_list *list_item, *n;
>  
>  	if (!vmx->nested.l1_state)
>  		return;
>  
>  	kfree(vmx->nested.l1_state);
>  	vmx->nested.l1_state = NULL;
> +
> +	list_for_each_entry_safe(list_item, n, &vmx->nested.l2_vmcs_list,
> +				 list) {
> +		free_vmcs(list_item->l2_vmcs);
> +		list_del(&(list_item->list));
> +	}
>  }
>  
>  
> @@ -3352,6 +3498,93 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
>  	return 1;
>  }
>  
> +static void clear_rflags_cf_zf(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long rflags;
> +	rflags = vmx_get_rflags(vcpu);
> +	rflags &= ~(X86_EFLAGS_CF | X86_EFLAGS_ZF);
> +	vmx_set_rflags(vcpu, rflags);
> +}
> +
> +/*
> + * Decode the memory address (operand) of a vmx instruction according to
> + * Table 23-12/23-11. For additional information regarding offset
> + * calculation see 3.7.5.
> + */
> +static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
> +				 unsigned long exit_qualification,
> +				 u32 vmx_instruction_info)
> +{
> +	int scaling = vmx_instruction_info & 3;			/* bits 0:1 scaling */
> +	int addr_size = (vmx_instruction_info >> 7) & 7;	/* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
> +	bool is_reg = vmx_instruction_info & (1u << 10);	/* bit 10, 1=register operand, 0=memory */
> +	int seg_reg = (vmx_instruction_info >> 15) & 7;		/* bits 15:17 segment register */
> +	int index_reg = (vmx_instruction_info >> 18) & 0xf;	/* bits 18:21 index register */
> +	bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit 22 index register validity, 0=valid, 1=invalid */
> +	int base_reg = (vmx_instruction_info >> 23) & 0xf;	/* bits 23:26 base register */
> +	bool base_is_valid = !(vmx_instruction_info & (1u << 27)); /* bit 27 base register validity, 0=valid, 1=invalid */
> +	gva_t addr;
> +
> +	if (is_reg)
> +		return 0;
> +
> +	switch (addr_size) {
> +	case 1:
> +		exit_qualification &= 0xffffffff; /* high 32 bits are undefined according to the spec, page 23-7 */
> +		break;
> +	case 2:
> +		break;
> +	default:
> +		return 0;
> +	}
> +
> +	/* addr = segment_base + offset */
> +	/* offset = base + [index * scale] + displacement, see Figure 3-11 */
> +	addr = vmx_get_segment_base(vcpu, seg_reg);
> +	if (base_is_valid)
> +		addr += kvm_register_read(vcpu, base_reg);
> +	if (index_is_valid)
> +		addr += kvm_register_read(vcpu, index_reg)*scaling;
> +	addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
> +
> +	return addr;
> +}
> +
> +static int handle_vmclear(struct kvm_vcpu *vcpu)
> +{
> +	struct vcpu_vmx *vmx = to_vmx(vcpu);
> +	struct level_state *l2_state;
> +	gpa_t guest_vmcs_addr;
> +	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
> +	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
> +	gva_t vmcs_gva;
> +
> +	if (!nested_vmx_check_permission(vcpu))
> +		return 1;
> +
> +	vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
> +				       vmx_instruction_info);
> +
> +	if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
> +		return 1;
> +

Should check that the vmcs address is 4K aligned and that the given address
is not equal to the vmxon pointer.
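Something along these lines, perhaps (untested sketch; note that both the
vmxon_ptr field and the set_rflags_cf() failure helper are hypothetical,
neither exists in this series, and the vmxon pointer would have to be
recorded by handle_vmon()):

	if (!IS_ALIGNED(guest_vmcs_addr, PAGE_SIZE) ||
	    guest_vmcs_addr == vmx->nested.vmxon_ptr) {
		/* VMfail: set CF instead of clearing it (hypothetical helper) */
		set_rflags_cf(vcpu);
		skip_emulated_instruction(vcpu);
		return 1;
	}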
> +	vmx->nested.current_vmptr = guest_vmcs_addr;

vmclear doesn't change the current vmcs pointer.

> +	if (!nested_map_current(vcpu))
> +		return 1;
> +
> +	l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
> +	l2_state->vmclear = 1;
> +	nested_free_current_vmcs(vcpu);
> +
> +	vmx->nested.current_vmptr = -1ull;
> +

vmclear resets the current vmcs pointer to -1 only if it was called with the
current vmcs pointer as its argument.
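Combined with the previous point, the tail of handle_vmclear() should look
more like this (untested sketch, ignoring that nested_map_current() currently
keys off current_vmptr rather than taking the operand directly):

	/* VMCLEAR must not load a new current VMCS; it only makes its
	 * operand non-current if the operand happens to be current. */
	if (guest_vmcs_addr == vmx->nested.current_vmptr)
		vmx->nested.current_vmptr = -1ull;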
> +	nested_unmap_current(vcpu);
> +
> +	skip_emulated_instruction(vcpu);
> +	clear_rflags_cf_zf(vcpu);
> +
> +	return 1;
> +}
> +
>  static int handle_vmoff(struct kvm_vcpu *vcpu)
>  {
>  	struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -3695,7 +3928,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
>  	[EXIT_REASON_HLT]                     = handle_halt,
>  	[EXIT_REASON_INVLPG]                  = handle_invlpg,
>  	[EXIT_REASON_VMCALL]                  = handle_vmcall,
> -	[EXIT_REASON_VMCLEAR]                 = handle_vmx_insn,
> +	[EXIT_REASON_VMCLEAR]                 = handle_vmclear,
>  	[EXIT_REASON_VMLAUNCH]                = handle_vmx_insn,
>  	[EXIT_REASON_VMPTRLD]                 = handle_vmx_insn,
>  	[EXIT_REASON_VMPTRST]                 = handle_vmx_insn,
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index b698952..e5acf22 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2773,8 +2773,8 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
>  	return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v);
>  }
>  
> -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> -			       struct kvm_vcpu *vcpu)
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			struct kvm_vcpu *vcpu)
>  {
>  	void *data = val;
>  	int r = X86EMUL_CONTINUE;
> @@ -2802,6 +2802,7 @@ static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  out:
>  	return r;
>  }
> +EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
>  
>  static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
>  				struct kvm_vcpu *vcpu)
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 57204cb..2d7b2dc 100644
> --- a/arch/x86/kvm/x86.h
> +++ b/arch/x86/kvm/x86.h
> @@ -35,6 +35,9 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
>  struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
>  					      u32 function, u32 index);
>  
> +int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
> +			struct kvm_vcpu *vcpu);
> +
>  extern int nested;
>  
>  #endif
> -- 
> 1.6.0.4

--
			Gleb.