On 12/10/2009 08:38 PM, oritw@xxxxxxxxxx wrote:
From: Orit Wasserman <oritw@xxxxxxxxxx>

---
 arch/x86/kvm/vmx.c |  235 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 arch/x86/kvm/x86.c |    5 +-
 arch/x86/kvm/x86.h |    3 +
 3 files changed, 240 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2726a6c..a7ffd5e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -93,13 +93,39 @@ struct shared_msr_entry {
 };

 struct __attribute__ ((__packed__)) level_state {
+        /* Has the level1 guest done vmclear? */
+        bool vmclear;
+};
Suggest calling it launch_state and using an enum. We can have three states: uninitialized, clear, and launched. Not sure if this is really required by the spec.
Do we need vmclear in l1_state?
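Something along these lines, say (the names are placeholders, not from the patch):

enum launch_state {
        LAUNCH_STATE_UNINITIALIZED,     /* no VMCLEAR or VMLAUNCH yet */
        LAUNCH_STATE_CLEAR,             /* VMCLEAR done, VMLAUNCH is valid */
        LAUNCH_STATE_LAUNCHED,          /* VMLAUNCH done, VMRESUME is valid */
};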
+struct __attribute__ ((__packed__)) nested_vmcs_page {
+        u32 revision_id;
+        u32 abort;
+        struct level_state l2_state;
+};
+
+struct nested_vmcs_list {
+        struct list_head list;
Suggest calling the member 'link': each entry is a link in the list, not the list itself.
+        gpa_t vmcs_addr;
+        struct vmcs *l2_vmcs;
 };

 struct nested_vmx {
         /* Has the level1 guest done vmxon? */
         bool vmxon;
+        /* What is the location of the current vmcs l1 keeps for l2 */
+        gpa_t current_vmptr;
         /* Level 1 state for switching to level 2 and back */
         struct level_state *l1_state;
+        /* list of vmcs for each l2 guest created by l1 */
+        struct list_head l2_vmcs_list;
+        /* l2 page corresponding to the current vmcs set by l1 */
+        struct nested_vmcs_page *current_l2_page;
 };

 struct vcpu_vmx {
@@ -156,6 +182,76 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
         return container_of(vcpu, struct vcpu_vmx, vcpu);
 }

+static struct page *nested_get_page(struct kvm_vcpu *vcpu,
+                                    u64 vmcs_addr)
+{
+        struct page *vmcs_page = NULL;
+
+        down_read(&current->mm->mmap_sem);
+        vmcs_page = gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);
+        up_read(&current->mm->mmap_sem);
gfn_to_page() doesn't need mmap_sem (and may deadlock if you take it).
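A sketch of the simplified version, assuming nothing else here needs mmap_sem:

static struct page *nested_get_page(struct kvm_vcpu *vcpu, u64 vmcs_addr)
{
        /* gfn_to_page() takes its own locks and references */
        struct page *vmcs_page =
                gfn_to_page(vcpu->kvm, vmcs_addr >> PAGE_SHIFT);

        if (is_error_page(vmcs_page)) {
                kvm_release_page_clean(vmcs_page);
                return NULL;
        }

        return vmcs_page;
}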
+
+        if (is_error_page(vmcs_page)) {
+                printk(KERN_ERR "%s error allocating page 0x%llx\n",
+                       __func__, vmcs_addr);
+                kvm_release_page_clean(vmcs_page);
+                return NULL;
+        }
+
+        return vmcs_page;
+
+}
+
+static int nested_map_current(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        struct page *vmcs_page =
+                nested_get_page(vcpu, vmx->nested.current_vmptr);
+        struct nested_vmcs_page *mapped_page;
+
+        if (vmcs_page == NULL) {
+                printk(KERN_INFO "%s: failure in nested_get_page\n", __func__);
+                return 0;
+        }
+
+        if (vmx->nested.current_l2_page) {
+                printk(KERN_INFO "%s: shadow vmcs already mapped\n", __func__);
+                WARN_ON(1);
+                return 0;
+        }
+
+        mapped_page = kmap_atomic(vmcs_page, KM_USER0);
+
+        if (!mapped_page) {
+                printk(KERN_INFO "%s: error in kmap_atomic\n", __func__);
+                return 0;
+        }
kmap_atomic() can't fail.
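So this can simply become:

        mapped_page = kmap_atomic(vmcs_page, KM_USER0); /* cannot fail */
        vmx->nested.current_l2_page = mapped_page;
        return 1;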
+
+        vmx->nested.current_l2_page = mapped_page;
+
+        return 1;
+}
+
+static void nested_unmap_current(struct kvm_vcpu *vcpu)
+{
+        struct page *page;
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+        if (!vmx->nested.current_l2_page) {
+                printk(KERN_INFO "Shadow vmcs already unmapped\n");
+                WARN_ON(1);
Use BUG_ON(), since this can't happen unless there's a bug.
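i.e.:

        /* cannot happen unless there is a bug elsewhere */
        BUG_ON(!vmx->nested.current_l2_page);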
+                return;
+        }
+
+        page = kmap_atomic_to_page(vmx->nested.current_l2_page);
+
+        kunmap_atomic(vmx->nested.current_l2_page, KM_USER0);
+
+        kvm_release_page_dirty(page);
+
+        vmx->nested.current_l2_page = NULL;
+}
+
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);

@@ -1144,6 +1240,35 @@ static int nested_vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
         return 0;
 }

+static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
+{
+        int r = 0;
+        uint size;
+
+        *gentry = 0;
+
+        if (is_long_mode(vcpu))
+                size = sizeof(u64);
+        else
+                size = sizeof(u32);
I think the gpa is always 64 bit, regardless of the current mode.
+
+        r = kvm_read_guest_virt(gva, gentry,
+                                size, vcpu);
+        if (r) {
+                printk(KERN_ERR "%s cannot read guest vmcs addr %lx : %d\n",
+                       __func__, vcpu->arch.regs[VCPU_REGS_RAX], r);
RAX may not be relevant. Just return, and the user can disassemble the instruction and see for themselves.
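With both of the above, the whole function could shrink to something like this (assuming the operand really is always 64 bits):

static int read_guest_vmcs_gpa(struct kvm_vcpu *vcpu, gva_t gva, u64 *gentry)
{
        int r;

        *gentry = 0;
        /* the vmcs address operand is a 64-bit physical address
         * regardless of guest mode */
        r = kvm_read_guest_virt(gva, gentry, sizeof(u64), vcpu);
        if (r)
                return r;

        if (!IS_ALIGNED(*gentry, PAGE_SIZE))
                return 1;

        return 0;
}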
+                return r;
+        }
+
+        if (!IS_ALIGNED(*gentry, PAGE_SIZE)) {
+                printk(KERN_DEBUG "%s addr %llx not aligned\n",
+                       __func__, *gentry);
+                return 1;
+        }
+
+        return 0;
+}
+
+/*
+ * Decode the memory address (operand) of a vmx instruction according to
+ * Table 23-12/23-11. For additional information regarding offset
+ * calculation see 3.7.5.
+ */
+static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu,
+                                 unsigned long exit_qualification,
+                                 u32 vmx_instruction_info)
+{
+        int scaling = vmx_instruction_info & 3;                 /* bits 0:1 scaling */
+        int addr_size = (vmx_instruction_info >> 7) & 7;        /* bits 7:9 address size, 0=16bit, 1=32bit, 2=64bit */
+        bool is_reg = vmx_instruction_info & (1u << 10);        /* bit 10: 1=register operand, 0=memory */
+        int seg_reg = (vmx_instruction_info >> 15) & 7;         /* bits 15:17 segment register */
+        int index_reg = (vmx_instruction_info >> 18) & 0xf;     /* bits 18:21 index register */
+        bool index_is_valid = !(vmx_instruction_info & (1u << 22)); /* bit 22 index register validity, 0=valid, 1=invalid */
+        int base_reg = (vmx_instruction_info >> 23) & 0xf;      /* bits 23:26 base register */
+        bool base_is_valid = !(vmx_instruction_info & (1u << 27)); /* bit 27 base register validity, 0=valid, 1=invalid */
+        gva_t addr;
+
+        if (is_reg)
+                return 0;
Should #UD.
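e.g. (the early-return convention here is made up; the function would need a way to report failure, say an int return with the gva as an out parameter):

        if (is_reg) {
                kvm_queue_exception(vcpu, UD_VECTOR);
                return 0;       /* caller must not use the result */
        }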
+
+        switch (addr_size) {
+        case 1:
+                exit_qualification &= 0xffffffff; /* high 32 bits are undefined according to the spec, page 23-7 */
+                break;
+        case 2:
+                break;
+        default:
+                return 0;
+        }
+
+        /* Addr = segment_base + offset */
+        /* offset = Base + [Index * Scale] + Displacement, see Figure 3-11 */
+        addr = vmx_get_segment_base(vcpu, seg_reg);
+        if (base_is_valid)
+                addr += kvm_register_read(vcpu, base_reg);
+        if (index_is_valid)
+                addr += kvm_register_read(vcpu, index_reg) * scaling;
Shouldn't this be a shift? Wish we had something like that for emulate.c.
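(The scaling field encodes log2 of the scale factor, 1/2/4/8, so:)

        if (index_is_valid)
                addr += kvm_register_read(vcpu, index_reg) << scaling;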
+        addr += exit_qualification; /* exit qualification holds the displacement, spec page 23-7 */
+
+        return addr;
+}
+
+static int handle_vmclear(struct kvm_vcpu *vcpu)
+{
+        struct vcpu_vmx *vmx = to_vmx(vcpu);
+        struct level_state *l2_state;
+        gpa_t guest_vmcs_addr;
+        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+        gva_t vmcs_gva;
+
+        if (!nested_vmx_check_permission(vcpu))
+                return 1;
+
+        vmcs_gva = get_vmx_mem_address(vcpu, exit_qualification,
+                                       vmx_instruction_info);
I think you can let get_vmx_mem_address() do the vmread()s, simpler.
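Something like (sketch):

static gva_t get_vmx_mem_address(struct kvm_vcpu *vcpu)
{
        unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
        u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);

        /* ... decode as above ... */
}

and handle_vmclear() is left with just vmcs_gva = get_vmx_mem_address(vcpu);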
+
+        if (read_guest_vmcs_gpa(vcpu, vmcs_gva, &guest_vmcs_addr))
+                return 1;
+
+        vmx->nested.current_vmptr = guest_vmcs_addr;
+        if (!nested_map_current(vcpu))
+                return 1;
+
+        l2_state = &(to_vmx(vcpu)->nested.current_l2_page->l2_state);
+        l2_state->vmclear = 1;
+        nested_free_current_vmcs(vcpu);
Why free? Isn't the purpose of the list to keep those active?
+
+        vmx->nested.current_vmptr = -1ull;
+
+        nested_unmap_current(vcpu);
+
+        skip_emulated_instruction(vcpu);
+        clear_rflags_cf_zf(vcpu);
+
+        return 1;
+}
+
As usual, if you can split some of the infrastructure into separate patches, it would help review.
--
error compiling committee.c: too many arguments to function