On 12/10/2009 08:38 PM, oritw@xxxxxxxxxx wrote:
From: Orit Wasserman <oritw@xxxxxxxxxx>

---
 arch/x86/kvm/vmx.c |  235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    5 +-
 arch/x86/kvm/x86.h |    3 +
 3 files changed, 240 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2726a6c..a7ffd5e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,13 +93,39 @@ struct shared_msr_entry {
 };

 struct __attribute__ ((__packed__)) level_state {
+        /* Has the level1 guest done vmclear? */
+        bool vmclear;
+};
Suggest calling it launch_state and using an enum. We can have three states: uninitialized, clear, and launched. Not sure if this is really required by the spec.
Do we need vmclear in l1_state?
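Something along these lines, say (the names are placeholders, not from the patch):

enum launch_state {
        LAUNCH_STATE_UNINITIALIZED,     /* no VMCLEAR or VMLAUNCH yet */
        LAUNCH_STATE_CLEAR,             /* VMCLEAR done, VMLAUNCH is valid */
        LAUNCH_STATE_LAUNCHED,          /* VMLAUNCH done, VMRESUME is valid */
};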
+struct __attribute__ ((__packed__)) nested_vmcs_page {
+        u32 revision_id;
+        u32 abort;
+        struct level_state l2_state;
+};
+
+struct nested_vmcs_list {
+        struct list_head list;
Suggest calling the member 'link': each entry is a link in the list, not the list itself.
+        gpa_t vmcs_addr;
+        struct vmcs *l2_vmcs;
 };

 struct nested_vmx {
         /* Has the level1 guest done vmxon? */
         bool vmxon;
+        /* What is the location of the current vmcs l1 keeps for l2 */
+        gpa_t current_vmptr;
         /* Level 1 state for switching to level 2 and back */
         struct level_state *l1_state;
+        /* list of vmcs for each l2 guest created by l1 */
+        struct list_head l2_vmcs_list;
+        /* l2 page corresponding to the current vmcs set by l1 */
+        struct nested_vmcs_page *current_l2_page;
 };

 struct vcpu_vmx {
@@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
         return container_of(vcpu, struct vcpu_vmx, vcpu);
 }

+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+                                    u64 vmcs_addr)
+{
+        struct page *vmcs_page = NULL;
+
+        down_read(&current->mm->mmap_sem);
+        vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+        up_read(&current->mm->mmap_sem);
gfn_to_page() doesn't need mmap_sem (and may deadlock if you take it).
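A sketch of the simplified version, assuming nothing else here needs mmap_sem:

static struct page *nested_get_page(struct kvm_vcpu *vcpu, u64 vmcs_addr)
{
        /* gfn_to_page() takes its own locks and references */
        struct page *vmcs_page =
                gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);

        if (is_error_page(vmcs_page)) {
                kvm_release_page_clean(vmcs_page);
                return NULL;
        }

        return vmcs_page;
}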
+
+        if (is_error_page(vmcs_page)) {
+                printk(KERN_ERR "%s error allocating page 0x%llx\n",
+                       __func__, vmcs_addr);
+                kvm_release_page_clean(vmcs_page);
+                return NULL;
+        }
+
+        return vmcs_page;
+
+}
+
+static int nested_map_current(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        struct page *vmcs_page =
+                nested_get_page(vcpu, vmx->nested.current_vmptr);
+        struct nested_vmcs_page *mapped_page;
+
+        if (vmcs_page == NULL) {
+                printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
+                return 0;
+        }
+
+        if (vmx->nested.current_l2_page) {
+                printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
+                WARN_ON(1);
+                return 0;
+        }
+
+        mapped_page = kmap_atomic(vmcs_page, KM_USER0);
+
+        if (!mapped_page) {
+                printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
+                return 0;
+        }
kmap_atomic() can't fail.
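So this can simply become:

        mapped_page = kmap_atomic(vmcs_page, KM_USER0); /* cannot fail */
        vmx->nested.current_l2_page = mapped_page;
        return 1;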
+
+        vmx->nested.current_l2_page = mapped_page;
+
+        return 1;
+}
+
+static void nested_unmap_current(struct kvm_vcpu *vcpu)
+{
+        struct page *page;
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        if (!vmx->nested.current_l2_page) {
+                printk(KERN_INFO "Shadow vmcs already unmapped\n");
+                WARN_ON(1);
Use BUG_ON(), since this can't happen unless there's a bug.
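i.e.:

        /* cannot happen unless there is a bug elsewhere */
        BUG_ON(!vmx->nested.current_l2_page);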
+                return;
+        }
+
+        page = kmap_atomic_to_page(vmx->nested.current_l2_page);
+
+        kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
+
+        kvm_release_page_dirty(page);
+
+        vmx->nested.current_l2_page = NULL;
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);

@@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
         return 0;
 }

+static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
+{
+        int r = 0;
+        uint size;
+
+        *gentry = 0;
+
+        if (is_long_mode(vcpu))
+                size = sizeof(u64);
+        else
+                size = sizeof(u32);
I think the gpa is always 64 bit, regardless of the current mode.
+
+        r = kvm_read_guest_virt(gva, gentry,
+                                size, vcpu);
+        if (r) {
+                printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+                       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
RAX may not be relevant. Just return, and the user can disassemble the instruction and see for themselves.
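With both of the above, the whole function could shrink to something like this (assuming the operand really is always 64 bits):

static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
{
        int r;

        *gentry = 0;
        /* the vmcs address operand is a 64-bit physical address
         * regardless of guest mode */
        r = kvm_read_guest_virt(gva, gentry, sizeof(u64), vcpu);
        if (r)
                return r;

        if (!IS_ALIGNED(*gentry, PAGE_SIZE))
                return 1;

        return 0;
}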
+                return r;
+        }
+
+        if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+                printk(KERN_DEBUG "%s addr %llx not aligned\n",
+                       __func__, *gentry);
+                return 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Decode the memory address (operand) of a vmx instruction according to
+ * Table 23-12/23-11. For additional information regarding offset
+ * calculation see 3.7.5.
+ */
+static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
+                                 unsigned long exit_qualification,
+                                 u32 vmx_instruction_info)
+{
+        int scaling = vmx_instruction_info & 3;                 /* bits 0:1 scaling */
+        int addr_size = (vmx_instruction_info >> 7) & 7;        /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
+        bool is_reg = vmx_instruction_info & (1u << 10);        /* bit 10: 1=register operand, 0=memory */
+        int seg_reg = (vmx_instruction_info >> 15) & 7;         /* bits 15:17 segment register */
+        int index_reg = (vmx_instruction_info >> 18) & 0xf;     /* bits 18:21 index register */
+        bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit 22 index register validity, 0=valid, 1=invalid */
+        int base_reg = (vmx_instruction_info >> 23) & 0xf;      /* bits 23:26 base register */
+        bool base_is_valid = !(vmx_instruction_info & (1u << 27)); /* bit 27 base register validity, 0=valid, 1=invalid */
+        gva_t addr;
+
+        if (is_reg)
+                return 0;
Should #UD.
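e.g. (the early-return convention here is made up; the function would need a way to report failure, say an int return with the gva as an out parameter):

        if (is_reg) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;       /* caller must not use the result */
        }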
+
+        switch (addr_size) {
+        case 1:
+                exit_qualification &= 0xffffffff; /* high 32 bits are undefined according to the spec, page 23-7 */
+                break;
+        case 2:
+                break;
+        default:
+                return 0;
+        }
+
+        /* Addr = segment_base + offset */
+        /* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
+        addr = vmx_get_segment_base(vcpu, seg_reg);
+        if (base_is_valid)
+                addr += kvm_register_read(vcpu, base_reg);
+        if (index_is_valid)
+                addr += kvm_register_read(vcpu, index_reg) * scaling;
Shouldn't this be a shift? Wish we had something like that for emulate.c.
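(The scaling field encodes log2 of the scale factor, 1/2/4/8, so:)

        if (index_is_valid)
                addr += kvm_register_read(vcpu, index_reg) << scaling;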
+        addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
+
+        return addr;
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        struct level_state *l2_state;
+        gpa_t guest_vmcs_addr;
+        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+        gva_t vmcs_gva;
+
+        if (!nested_vmx_check_permission(vcpu))
+                return 1;
+
+        vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+                                       vmx_instruction_info);
I think you can let get_vmx_mem_address() do the vmread()s, simpler.
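Something like (sketch):

static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

        /* ... decode as above ... */
}

and handle_vmclear() is left with just vmcs_gva = get_vmx_mem_address(vcpu);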
+
+        if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+                return 1;
+
+        vmx->nested.current_vmptr = guest_vmcs_addr;
+        if (!nested_map_current(vcpu))
+                return 1;
+
+        l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
+        l2_state->vmclear = 1;
+        nested_free_current_vmcs(vcpu);
Why free? Isn't the purpose of the list to keep those active?
+
+        vmx->nested.current_vmptr = -1ull;
+
+        nested_unmap_current(vcpu);
+
+        skip_emulated_instruction(vcpu);
+        clear_rflags_cf_zf(vcpu);
+
+        return 1;
+}
+
As usual, if you can split some of the infrastructure into separate patches, it would help review.
--
error compiling committee.c: too many arguments to function