On Thu, 8 Apr 2021 at 21:19, Thomas Gleixner <tglx@xxxxxxxxxxxxx> wrote: > > On Tue, Apr 06 2021 at 21:47, Sean Christopherson wrote: > > On Tue, Apr 06, 2021, Michael Tokarev wrote: > >> broke kvm guest cpu time accounting - after this commit, when running > >> qemu-system-x86_64 -enable-kvm, the guest time (in /proc/stat and > >> elsewhere) is always 0. > >> > >> I dunno why it happened, but it happened, and all kernels after 5.9 > >> are affected by this. > >> > >> This commit is found in a (painful) git bisect between kernel 5.8 and 5.10. > > > > Yes :-( > > > > There's a bugzilla[1] and two proposed fixes[2][3]. I don't particularly like > > either of the fixes, but an elegant solution hasn't presented itself. > > > > Thomas/Paolo, can you please weigh in? > > > > [1] https://bugzilla.kernel.org/show_bug.cgi?id=209831 > > [2] https://lkml.kernel.org/r/1617011036-11734-1-git-send-email-wanpengli@xxxxxxxxxxx > > [3] https://lkml.kernel.org/r/20210206004218.312023-1-seanjc@xxxxxxxxxx > > All of the solutions I looked at so far are ugly as hell. The problem is > that the accounting is plumbed into the context tracking and moving > context tracking around to a different place is just wrong. > > I think the right solution is to seperate the time accounting logic out > from guest_enter/exit_irqoff() and have virt time specific helpers which > can be placed at the proper spots in kvm. For x86 part, how about something like below: diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 48b396f3..7aeb724 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -3730,6 +3730,7 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); + account_guest_enter(); guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); @@ -3759,6 +3760,8 @@ static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu) */ lockdep_hardirqs_off(CALLER_ADDR0); guest_exit_irqoff(); + if (vtime_accounting_enabled_this_cpu()) + account_guest_exit(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index c05e6e2..5f6c30c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6617,6 +6617,7 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, lockdep_hardirqs_on_prepare(CALLER_ADDR0); instrumentation_end(); + account_guest_enter(); guest_enter_irqoff(); lockdep_hardirqs_on(CALLER_ADDR0); @@ -6648,6 +6649,8 @@ static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu, */ lockdep_hardirqs_off(CALLER_ADDR0); guest_exit_irqoff(); + if (vtime_accounting_enabled_this_cpu()) + account_guest_exit(); instrumentation_begin(); trace_hardirqs_off_finish(); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 16fb395..33422c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -9229,6 +9229,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) ++vcpu->stat.exits; local_irq_disable(); kvm_after_interrupt(vcpu); + if (!vtime_accounting_enabled_this_cpu()) + account_guest_exit(); if (lapic_in_kernel(vcpu)) { s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta; diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index bceb064..ff70229 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -104,8 +104,7 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -/* must be called with irqs disabled */ -static __always_inline void guest_enter_irqoff(void) +static __always_inline void account_guest_enter(void) { instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) @@ -113,7 +112,11 @@ static __always_inline void guest_enter_irqoff(void) else current->flags |= PF_VCPU; instrumentation_end(); +} +/* must be called with irqs disabled */ +static __always_inline void guest_enter_irqoff(void) +{ if (context_tracking_enabled()) __context_tracking_enter(CONTEXT_GUEST); @@ -131,11 +134,8 @@ static __always_inline void guest_enter_irqoff(void) } } -static __always_inline void guest_exit_irqoff(void) +static __always_inline void account_guest_exit(void) { - if (context_tracking_enabled()) - __context_tracking_exit(CONTEXT_GUEST); - instrumentation_begin(); if (vtime_accounting_enabled_this_cpu()) vtime_guest_exit(current); @@ -144,8 +144,14 @@ static __always_inline void guest_exit_irqoff(void) instrumentation_end(); } +static __always_inline void guest_exit_irqoff(void) +{ + if (context_tracking_enabled()) + __context_tracking_exit(CONTEXT_GUEST); +} + #else -static __always_inline void guest_enter_irqoff(void) +static __always_inline void account_guest_enter(void) { /* * This is running in ioctl context so its safe @@ -155,11 +161,17 @@ static __always_inline void guest_enter_irqoff(void) instrumentation_begin(); vtime_account_kernel(current); current->flags |= PF_VCPU; + instrumentation_end(); +} + +static __always_inline void guest_enter_irqoff(void) +{ + instrumentation_begin(); rcu_virt_note_context_switch(smp_processor_id()); instrumentation_end(); } -static __always_inline void guest_exit_irqoff(void) +static __always_inline void account_guest_exit(void) { instrumentation_begin(); /* Flush the guest cputime we spent on the guest */ @@ -167,6 +179,11 @@ static __always_inline void guest_exit_irqoff(void) current->flags &= ~PF_VCPU; instrumentation_end(); } + +static __always_inline void guest_exit_irqoff(void) +{ + +} #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ static inline void guest_exit(void)