On Thu, 2021-05-06 at 10:32 +0000, ilstam@xxxxxxxxxxx wrote:
> From: Ilias Stamatis <ilstam@xxxxxxxxxx>
> 
> When L2 is entered we need to "merge" the TSC multiplier and TSC offset
> values of VMCS01 and VMCS12 and store the result into the current
> VMCS02.
> 
> The 02 values are calculated using the following equations:
> offset_02 = ((offset_01 * mult_12) >> 48) + offset_12
> mult_02 = (mult_01 * mult_12) >> 48

I would mention that 48 is kvm_tsc_scaling_ratio_frac_bits instead.
Also maybe add the common code in a separate patch?

> 
> Signed-off-by: Ilias Stamatis <ilstam@xxxxxxxxxx>
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/vmx/nested.c       | 26 ++++++++++++++++++++++----
>  arch/x86/kvm/x86.c              | 25 +++++++++++++++++++++++++
>  3 files changed, 48 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index cdddbf0b1177..e7a1eb36f95a 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1780,6 +1780,7 @@ void kvm_define_user_return_msr(unsigned index, u32 msr);
>  int kvm_set_user_return_msr(unsigned index, u64 val, u64 mask);
>  
>  u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc, bool l1);
> +u64 kvm_compute_02_tsc_offset(u64 l1_offset, u64 l2_multiplier, u64 l2_offset);
>  u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc);
>  
>  unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu);
> diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
> index bced76637823..a1bf28f33837 100644
> --- a/arch/x86/kvm/vmx/nested.c
> +++ b/arch/x86/kvm/vmx/nested.c
> @@ -3353,8 +3353,22 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
>  	}
>  
>  	enter_guest_mode(vcpu);
> -	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
> -		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
> +
> +	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) {
> +		if (vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_TSC_SCALING) {
> +			vcpu->arch.tsc_offset = kvm_compute_02_tsc_offset(
> +					vcpu->arch.l1_tsc_offset,
> +					vmcs12->tsc_multiplier,
> +					vmcs12->tsc_offset);
> +
> +			vcpu->arch.tsc_scaling_ratio = mul_u64_u64_shr(
> +					vcpu->arch.tsc_scaling_ratio,
> +					vmcs12->tsc_multiplier,
> +					kvm_tsc_scaling_ratio_frac_bits);
> +		} else {
> +			vcpu->arch.tsc_offset += vmcs12->tsc_offset;
> +		}
> +	}
>  
>  	if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
>  		exit_reason.basic = EXIT_REASON_INVALID_STATE;
> @@ -4454,8 +4468,12 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
>  	if (nested_cpu_has_preemption_timer(vmcs12))
>  		hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
>  
> -	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
> -		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
> +	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING) {
> +		vcpu->arch.tsc_offset = vcpu->arch.l1_tsc_offset;
> +
> +		if (vmcs12->secondary_vm_exec_control & SECONDARY_EXEC_TSC_SCALING)
> +			vcpu->arch.tsc_scaling_ratio = vcpu->arch.l1_tsc_scaling_ratio;
> +	}
>  
>  	if (likely(!vmx->fail)) {
>  		sync_vmcs02_to_vmcs12(vcpu, vmcs12);
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 26a4c0f46f15..87deb119c521 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2266,6 +2266,31 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
>  	return target_tsc - tsc;
>  }
>  
> +/*
> + * This function computes the TSC offset that is stored in VMCS02 when entering
> + * L2 by combining the offset and multiplier values of VMCS01 and VMCS12.
> + */
> +u64 kvm_compute_02_tsc_offset(u64 l1_offset, u64 l2_multiplier, u64 l2_offset)
> +{
> +	u64 offset;
> +
> +	/*
> +	 * The L1 offset is interpreted as a signed number by the CPU and can
> +	 * be negative. So we extract the sign before the multiplication and
> +	 * put it back afterwards if needed.

If I understand correctly, the reason for the sign extraction is that we
don't have mul_s64_u64_shr. Maybe we should add it?
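Something like the below perhaps, next to mul_u64_u64_shr() in
include/linux/math64.h? Just a sketch of what I mean, mirroring the sign
handling this patch already does inline; completely untested:

static inline u64 mul_s64_u64_shr(s64 a, u64 b, unsigned int shift)
{
	u64 ret;

	/*
	 * Extract the sign before the multiplication and put it back
	 * afterwards if needed, since mul_u64_u64_shr() only works on
	 * unsigned operands.
	 */
	ret = mul_u64_u64_shr(abs(a), b, shift);
	if (a < 0)
		ret = -ret;

	return ret;
}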
The pattern of (A * B) >> shift appears many times in TSC scaling. So
instead of adding kvm_compute_02_tsc_offset() maybe just use something
like this:

merged_offset = l2_offset +
	mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
			kvm_tsc_scaling_ratio_frac_bits);

Or another idea: how about

u64 __kvm_scale_tsc_value(u64 value, u64 multiplier)
{
	return mul_u64_u64_shr(value, multiplier,
			       kvm_tsc_scaling_ratio_frac_bits);
}

and

s64 __kvm_scale_tsc_offset(u64 value, u64 multiplier)
{
	return mul_s64_u64_shr((s64)value, multiplier,
			       kvm_tsc_scaling_ratio_frac_bits);
}

And then use them in the code.

Overall though, the code *looks* correct to me, but I might have missed
something.

Best regards,
	Maxim Levitsky

> +	 */
> +	offset = mul_u64_u64_shr(abs((s64) l1_offset),
> +				 l2_multiplier,
> +				 kvm_tsc_scaling_ratio_frac_bits);
> +
> +	if ((s64) l1_offset < 0)
> +		offset = -((s64) offset);
> +
> +	offset += l2_offset;
> +	return offset;
> +}
> +EXPORT_SYMBOL_GPL(kvm_compute_02_tsc_offset);
> +
>  u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
>  {
>  	return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc, true);