The attached patch has been modified according to your comments to use 4 MSR bitmaps. Please help review it. Also, are there any comments on the other parts of this series? If so, they are welcome. :) Gleb Natapov wrote on 2013-01-14: > On Mon, Jan 14, 2013 at 11:25:45AM +0000, Zhang, Yang Z wrote: >> Gleb Natapov wrote on 2013-01-14: >>> On Mon, Jan 14, 2013 at 11:10:26AM +0000, Zhang, Yang Z wrote: >>>> Gleb Natapov wrote on 2013-01-14: >>>>> On Mon, Jan 14, 2013 at 11:01:02AM +0000, Zhang, Yang Z wrote: >>>>>> Gleb Natapov wrote on 2013-01-14: >>>>>>> On Mon, Jan 14, 2013 at 03:13:34PM +0800, Yang Zhang wrote: >>>>>>>> From: Yang Zhang <yang.z.zhang@xxxxxxxxx> >>>>>>>> >>>>>>>> basically to benefit from apicv, we need to enable virtualized >>>>>>>> x2apic mode. Currently, we only enable it when guest is really >>>>>>>> using x2apic. >>>>>>>> >>>>>>>> Also, clear MSR bitmap for corresponding x2apic MSRs when guest >>> enabled >>>>>>> x2apic: >>>>>>>> 0x800 - 0x8ff: no read intercept for apicv register virtualization, >>>>>>>> except APIC ID and TMCCT. >>>>>>>> APIC ID and TMCCT: need software's assistance to get right value. >>>>>>> Actually since msr bitmap is shared between all vcpus this will break >>>>>>> guests that do not enable x2apic. >>>>>> I don't think this case will exist. It will break the real OS too. >>>>>> >>>>> Which case? One VM uses x2apic another one does not? Bitmap is shared >>>>> between all vcpus of all VMs. >>>> Sorry. I misread your comments. >>>> >>> I miswrote it. Forgot to include "of all VMs" there. >>> >>>> Yes, it is really a problem. Maybe we need to use per VM msr bitmap instead >>> global bitmap. >>> Are you sure cpus cannot be in different modes during boot and smp >>> initialization? Where spec says that? >> I don't think hardware has this limitation. I mean use per vcpu's msr >> bitmap instead global bitmap. >> > Even if HW has such limitation we cannot have per VM bitmap since > malicious guest can abuse it. 
> > Per cpu msr bitmap means that each vcpu will waste one more page of > memory. We already have different global msr bitmap for long mode > and legacy mode, may be make it 4: x2apic X long X legacy. > >>>> >>>>>>>> Signed-off-by: Kevin Tian <kevin.tian@xxxxxxxxx> >>>>>>>> Signed-off-by: Yang Zhang <yang.z.zhang@xxxxxxxxx> >>>>>>>> --- >>>>>>>> arch/x86/include/asm/kvm_host.h | 1 + >>> arch/x86/include/asm/vmx.h >>>>>>>> | 1 + arch/x86/kvm/lapic.c | 15 +++- >>>>>>>> arch/x86/kvm/svm.c | 6 ++ arch/x86/kvm/vmx.c >>>>>>>> | 162 +++++++++++++++++++++++++++++++++++++-- 5 files >>>>> changed, 173 >>>>>>>> insertions(+), 12 deletions(-) >>>>>>>> diff --git a/arch/x86/include/asm/kvm_host.h >>>>>>>> b/arch/x86/include/asm/kvm_host.h index c431b33..35aa8e6 100644 >>>>>>>> --- a/arch/x86/include/asm/kvm_host.h +++ >>>>>>>> b/arch/x86/include/asm/kvm_host.h @@ -697,6 +697,7 @@ struct >>>>>>>> kvm_x86_ops { >>>>>>>> void (*enable_nmi_window)(struct kvm_vcpu *vcpu); void >>>>>>>> (*enable_irq_window)(struct kvm_vcpu *vcpu); void >>>>>>>> (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int >>>>>>>> irr); + void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, >>>>>>>> bool set); int (*set_tss_addr)(struct kvm *kvm, unsigned int >>>>>>>> addr); int (*get_tdp_level)(void); u64 (*get_mt_mask)(struct >>>>>>>> kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); >>>>>>>> diff --git a/arch/x86/include/asm/vmx.h >>>>>>>> b/arch/x86/include/asm/vmx.h index 44c3f7e..0a54df0 100644 --- >>>>>>>> a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ >>>>>>>> -139,6 +139,7 @@ >>>>>>>> #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 >>>>>>>> #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 >>>>>>>> #define SECONDARY_EXEC_RDTSCP 0x00000008 +#define >>>>>>>> SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define >>>>>>>> SECONDARY_EXEC_ENABLE_VPID 0x00000020 #define >>>>>>>> SECONDARY_EXEC_WBINVD_EXITING 0x00000040 #define >>>>>>>> SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 
>>>>>>>> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c >>>>>>>> index 0664c13..2ef5e2b 100644 >>>>>>>> --- a/arch/x86/kvm/lapic.c >>>>>>>> +++ b/arch/x86/kvm/lapic.c >>>>>>>> @@ -1323,12 +1323,17 @@ void kvm_lapic_set_base(struct kvm_vcpu >>>>> *vcpu, >>>>>>> u64 value) >>>>>>>> if (!kvm_vcpu_is_bsp(apic->vcpu)) >>>>>>>> value &= ~MSR_IA32_APICBASE_BSP; >>>>>>>> - vcpu->arch.apic_base = value; >>>>>>>> - if (apic_x2apic_mode(apic)) { >>>>>>>> - u32 id = kvm_apic_id(apic); >>>>>>>> - u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); >>>>>>>> - kvm_apic_set_ldr(apic, ldr); >>>>>>>> + if ((vcpu->arch.apic_base ^ value) & X2APIC_ENABLE) { >>>>>>>> + if (value & X2APIC_ENABLE) { >>>>>>>> + u32 id = kvm_apic_id(apic); >>>>>>>> + u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf)); >>>>>>>> + kvm_apic_set_ldr(apic, ldr); >>>>>>>> + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, true); >>>>>>>> + } else >>>>>>>> + kvm_x86_ops->set_virtual_x2apic_mode(vcpu, false); >>>>>>>> } >>>>>>>> + >>>>>>>> + vcpu->arch.apic_base = value; >>>>>>>> apic->base_address = apic->vcpu->arch.apic_base & >>>>>>>> MSR_IA32_APICBASE_BASE; >>>>>>>> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c >>>>>>>> index d29d3cd..38407e9 100644 >>>>>>>> --- a/arch/x86/kvm/svm.c >>>>>>>> +++ b/arch/x86/kvm/svm.c >>>>>>>> @@ -3571,6 +3571,11 @@ static void update_cr8_intercept(struct >>>>> kvm_vcpu >>>>>>> *vcpu, int tpr, int irr) >>>>>>>> set_cr_intercept(svm, INTERCEPT_CR8_WRITE); >>>>>>>> } >>>>>>>> +static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, >>>>>>>> bool set) +{ + return; +} + >>>>>>>> static int svm_nmi_allowed(struct kvm_vcpu *vcpu) { struct >>>>>>>> vcpu_svm *svm = to_svm(vcpu); @@ -4290,6 +4295,7 @@ static >>>>>>>> struct kvm_x86_ops svm_x86_ops = { .enable_nmi_window = >>>>>>>> enable_nmi_window, .enable_irq_window = enable_irq_window, >>>>>>>> .update_cr8_intercept = update_cr8_intercept, >>>>>>>> + .set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode, >>>>>>>> >>>>>>>> 
.set_tss_addr = svm_set_tss_addr, >>>>>>>> .get_tdp_level = get_npt_level, >>>>>>>> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c >>>>>>>> index 0403634..847022e 100644 >>>>>>>> --- a/arch/x86/kvm/vmx.c >>>>>>>> +++ b/arch/x86/kvm/vmx.c >>>>>>>> @@ -767,6 +767,12 @@ static inline bool >>>>>>> cpu_has_vmx_virtualize_apic_accesses(void) >>>>>>>> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; >>>>>>>> } >>>>>>>> +static inline bool cpu_has_vmx_virtualize_x2apic_mode(void) >>>>>>>> +{ >>>>>>>> + return vmcs_config.cpu_based_2nd_exec_ctrl & >>>>>>>> + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; >>>>>>>> +} >>>>>>>> + >>>>>>>> static inline bool cpu_has_vmx_apic_register_virt(void) >>>>>>>> { >>>>>>>> return vmcs_config.cpu_based_2nd_exec_ctrl & >>>>>>>> @@ -2543,6 +2549,7 @@ static __init int setup_vmcs_config(struct >>>>>>> vmcs_config *vmcs_conf) >>>>>>>> if (_cpu_based_exec_control & >>>>>>>> CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) { min2 = 0; opt2 = >>>>>>>> SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | >>>>>>>> + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | >>>>>>>> SECONDARY_EXEC_WBINVD_EXITING | SECONDARY_EXEC_ENABLE_VPID | >>>>>>>> SECONDARY_EXEC_ENABLE_EPT | @@ -3724,7 +3731,45 @@ static >>>>>>>> void free_vpid(struct vcpu_vmx *vmx) > spin_unlock(&vmx_vpid_lock); } >>>>>>>> -static void __vmx_disable_intercept_for_msr(unsigned long >>>>>>>> *msr_bitmap, u32 msr) +#define MSR_TYPE_R 1 +#define MSR_TYPE_W 2 >>>>>>>> +static void __vmx_disable_intercept_for_msr(unsigned long >>>>>>>> *msr_bitmap, + u32 msr, int type) +{ + int f = >>>>>>>> sizeof(unsigned long); + + if (!cpu_has_vmx_msr_bitmap()) >>>>>>>> + return; + + /* + * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap >>>>>>>> Address). Early manuals + >>>>>>>> >>>>>>>> * have the write-low and read-high bitmap offsets the wrong way >>>>>>>> round. + * We can control MSRs 0x00000000-0x00001fff and >>>>>>>> 0xc0000000-0xc0001fff. 
+ */ + if (msr <= 0x1fff) { + if (type & >>>>>>>> MSR_TYPE_R) + /* read-low */ + __clear_bit(msr, msr_bitmap + >>>>>>>> 0x000 / f); + + if (type & MSR_TYPE_W) + /* write-low */ + >>>>>>>> __clear_bit(msr, msr_bitmap + 0x800 / f); + + } else if ((msr >= >>>>>>>> 0xc0000000) && (msr <= 0xc0001fff)) { + msr &= 0x1fff; + if >>>>>>>> (type & MSR_TYPE_R) + /* read-high */ + __clear_bit(msr, >>>>>>>> msr_bitmap + 0x400 / f); + + if (type & MSR_TYPE_W) + /* >>>>>>>> write-high */ + __clear_bit(msr, msr_bitmap + 0xc00 / f); + + } >>>>>>>> +} + +static void __vmx_enable_intercept_for_msr(unsigned long >>>>>>>> *msr_bitmap, + u32 msr, int type) >>>>>>>> { >>>>>>>> int f = sizeof(unsigned long); >>>>>>>> @@ -3737,20 +3782,75 @@ static void >>>>>>> __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 > msr) >>>>>>>> * We can control MSRs 0x00000000-0x00001fff and >>>>>>>> 0xc0000000-0xc0001fff. */ if (msr <= 0x1fff) { >>>>>>>> - __clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */ >>>>>>>> - __clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */ >>>>>>>> + if (type & MSR_TYPE_R) >>>>>>>> + /* read-low */ >>>>>>>> + __set_bit(msr, msr_bitmap + 0x000 / f); >>>>>>>> + >>>>>>>> + if (type & MSR_TYPE_W) >>>>>>>> + /* write-low */ >>>>>>>> + __set_bit(msr, msr_bitmap + 0x800 / f); >>>>>>>> + >>>>>>>> } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { >>>>>>>> msr &= 0x1fff; >>>>>>>> - __clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */ >>>>>>>> - __clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */ >>>>>>>> + if (type & MSR_TYPE_R) >>>>>>>> + /* read-high */ >>>>>>>> + __set_bit(msr, msr_bitmap + 0x400 / f); >>>>>>>> + >>>>>>>> + if (type & MSR_TYPE_W) >>>>>>>> + /* write-high */ >>>>>>>> + __set_bit(msr, msr_bitmap + 0xc00 / f); >>>>>>>> + >>>>>>>> } } + static void vmx_disable_intercept_for_msr(u32 msr, bool >>>>>>>> longmode_only) { if (!longmode_only) >>>>>>>> - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr); >>>>>>>> - 
__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr); >>>>>>>> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + >>>>>>>> msr, MSR_TYPE_R | MSR_TYPE_W); >>>>>>>> + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, >>>>>>>> + msr, MSR_TYPE_R | MSR_TYPE_W); +} + +static void >>>>>>>> vmx_intercept_for_msr_read(u32 msr, bool longmode_only, + bool >>>>>>>> set) +{ + if (!longmode_only) { + if (set) + >>>>>>>> __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, + msr, >>>>>>>> MSR_TYPE_R); + else + >>>>>>>> __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + >>>>>>>> msr, MSR_TYPE_R); + + } + if (set) + >>>>>>>> __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, >>>>>>>> + msr, MSR_TYPE_R); + else + >>>>>>>> __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, >>>>>>>> + msr, MSR_TYPE_R); +} + +static void >>>>>>>> vmx_intercept_for_msr_write(u32 msr, bool longmode_only, + >>>>>>>> bool set) +{ + if (!longmode_only) { + if (set) + >>>>>>>> __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy, + >>>>>>>> msr, MSR_TYPE_W); + else + >>>>>>>> __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, + >>>>>>>> msr, MSR_TYPE_W); + + } + if (set) + >>>>>>>> __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode, >>>>>>>> + msr, MSR_TYPE_W); + else + >>>>>>>> __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, >>>>>>>> + msr, MSR_TYPE_W); >>>>>>>> } >>>>>>>> >>>>>>>> /* >>>>>>>> @@ -3848,6 +3948,7 @@ static u32 > vmx_secondary_exec_control(struct >>>>>>> vcpu_vmx *vmx) >>>>>>>> exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING; if >>>>>>>> (!enable_apicv_reg) exec_control &= >>>>>>>> ~SECONDARY_EXEC_APIC_REGISTER_VIRT; + exec_control &= >>>>>>>> ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; return >>>>> exec_control; } >>>>>>>> @@ -6103,6 +6204,52 @@ static void update_cr8_intercept(struct >>>>> kvm_vcpu >>>>>>> *vcpu, int tpr, int irr) >>>>>>>> vmcs_write32(TPR_THRESHOLD, irr); >>>>>>>> } >>>>>>>> +static void 
vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, >>>>>>>> bool set) +{ + u32 exec_control, sec_exec_control; + int msr; >>>>>>>> + struct vcpu_vmx *vmx = to_vmx(vcpu); + + /* There is not point >>>>>>>> to enable virtualize x2apic without enable + * apicv*/ + if >>>>>>>> (!cpu_has_vmx_virtualize_x2apic_mode() || !enable_apicv_reg) + >>>>>>>> return; + + if (set) { + exec_control = >>>>>>>> vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + /* virtualize x2apic >>>>>>>> mode relies on tpr shadow */ + if (!(exec_control & >>>>>>>> CPU_BASED_TPR_SHADOW)) + return; + } + + sec_exec_control = >>>>>>>> vmcs_read32(SECONDARY_VM_EXEC_CONTROL); + + if (set) { >>>>>>>> + sec_exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; >>>>>>>> + sec_exec_control |= SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + } >>>>>>>> else { + sec_exec_control &= >>>>>>>> ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE; + if >>>>>>>> (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm)) + >>>>>>>> sec_exec_control |= + SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; >>>>>>>> + } + vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control); >>>>>>>> + + for (msr = 0x800; msr <= 0x8ff; msr++) + >>>>>>>> vmx_intercept_for_msr_read(msr, false, !set); + + if (set) { + >>>>>>>> /* According SDM, in x2apic mode, the whole id reg is used. + >>>>>>>> * But in KVM, it only use the highest eight bits. 
Need to + * >>>>>>>> intercept it*/ + vmx_intercept_for_msr_read(0x802, false, true); >>>>>>>> + /* TMCCT */ + vmx_intercept_for_msr_read(0x839, false, true); >>>>>>>> + } + /* TPR */ + vmx_intercept_for_msr_write(0x808, false, >>>>>>>> !set); +} + >>>>>>>> static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { >>>>>>>> u32 exit_intr_info; @@ -7366,6 +7513,7 @@ static struct >>>>>>>> kvm_x86_ops vmx_x86_ops = { .enable_nmi_window = >>>>>>>> enable_nmi_window, .enable_irq_window = enable_irq_window, >>>>>>>> .update_cr8_intercept = update_cr8_intercept, >>>>>>>> + .set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode, >>>>>>>> >>>>>>>> .set_tss_addr = vmx_set_tss_addr, >>>>>>>> .get_tdp_level = get_ept_level, >>>>>>>> -- >>>>>>>> 1.7.1 >>>>>>> >>>>>>> -- >>>>>>> Gleb. >>>>>> >>>>>> >>>>>> Best regards, >>>>>> Yang >>>>> >>>>> -- >>>>> Gleb. >>>> >>>> >>>> Best regards, >>>> Yang >>>> >>> >>> -- >>> Gleb. >> >> >> Best regards, >> Yang >> > > -- > Gleb. Best regards, Yang
Attachment:
x2apic_revise.patch
Description: x2apic_revise.patch