As mentioned in the Linux kernel documentation, "sched_clock() is used for
scheduling and timestamping". While there is a default native implementation,
many paravirtualizations provide their own implementations.

For KVM, kvm_sched_clock_read() is used, and there is no way to disable only
KVM's sched_clock: the existing "no-kvmclock" parameter disables all
paravirtualized kvmclock features.

static inline void kvm_sched_clock_init(bool stable)
{
	if (!stable)
		clear_sched_clock_stable();
	kvm_sched_clock_offset = kvm_clock_read();
	paravirt_set_sched_clock(kvm_sched_clock_read);

	pr_info("kvm-clock: using sched offset of %llu cycles",
		kvm_sched_clock_offset);

	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
		     sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
}

There is a known issue that kvmclock may drift during vCPU hotplug [1].
Although a temporary fix is available [2], we may need a way to disable the
pv sched_clock. Nowadays, the TSC is more stable and has lower overhead than
kvmclock.

This RFC proposes to introduce a global parameter to disable the pv
sched_clock for all paravirtualizations. Please suggest and comment if other
options are better:

1. A global parameter (this RFC patch).

2. A kvmclock-specific parameter (e.g., like "no-vmw-sched-clock" in VMware).
   Indeed, I like this 2nd method.

3. Enforce the native sched_clock only when the TSC is invariant (the Hyper-V
   method).

4. Remove and clean up the pv sched_clock, and always use pv_sched_clock()
   for everyone (suggested by Peter Zijlstra in [3]). However, some
   paravirtualizations may want to keep their pv sched_clock, and introducing
   a parameter may be easier to backport to old kernel versions.

References:
[1] https://lore.kernel.org/all/20230926230649.67852-1-dongli.zhang@xxxxxxxxxx/
[2] https://lore.kernel.org/all/20231018195638.1898375-1-seanjc@xxxxxxxxxx/
[3] https://lore.kernel.org/all/20231002211651.GA3774@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/

Thank you very much for the suggestion!
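As a usage sketch only (assuming the parameter keeps the "no_pv_sched_clock"
name registered via early_param() in the patch below), a guest would opt out
of the pv sched_clock by adding the option to its kernel command line, e.g.:

	no_pv_sched_clock

With the option set, paravirt_set_sched_clock() returns -EPERM, so
kvm_sched_clock_init() skips the override and the guest keeps the native
TSC-based sched_clock.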
Signed-off-by: Dongli Zhang <dongli.zhang@xxxxxxxxxx>
---
 arch/x86/include/asm/paravirt.h |  2 +-
 arch/x86/kernel/kvmclock.c      | 12 +++++++-----
 arch/x86/kernel/paravirt.c      | 18 +++++++++++++++++-
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 6c8ff12140ae..f36edf608b6b 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -24,7 +24,7 @@ u64 dummy_sched_clock(void);
 DECLARE_STATIC_CALL(pv_steal_clock, dummy_steal_clock);
 DECLARE_STATIC_CALL(pv_sched_clock, dummy_sched_clock);
 
-void paravirt_set_sched_clock(u64 (*func)(void));
+int paravirt_set_sched_clock(u64 (*func)(void));
 
 static __always_inline u64 paravirt_sched_clock(void)
 {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index fb8f52149be9..0b8bf5677d44 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -93,13 +93,15 @@ static noinstr u64 kvm_sched_clock_read(void)
 
 static inline void kvm_sched_clock_init(bool stable)
 {
-	if (!stable)
-		clear_sched_clock_stable();
 	kvm_sched_clock_offset = kvm_clock_read();
-	paravirt_set_sched_clock(kvm_sched_clock_read);
 
-	pr_info("kvm-clock: using sched offset of %llu cycles",
-		kvm_sched_clock_offset);
+	if (!paravirt_set_sched_clock(kvm_sched_clock_read)) {
+		if (!stable)
+			clear_sched_clock_stable();
+
+		pr_info("kvm-clock: using sched offset of %llu cycles",
+			kvm_sched_clock_offset);
+	}
 
 	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
 		     sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 97f1436c1a20..2cfef94317b0 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -118,9 +118,25 @@ static u64 native_steal_clock(int cpu)
 DEFINE_STATIC_CALL(pv_steal_clock, native_steal_clock);
 DEFINE_STATIC_CALL(pv_sched_clock, native_sched_clock);
 
-void paravirt_set_sched_clock(u64 (*func)(void))
+static bool no_pv_sched_clock;
+
+static int __init parse_no_pv_sched_clock(char *arg)
+{
+	no_pv_sched_clock = true;
+	return 0;
+}
+early_param("no_pv_sched_clock", parse_no_pv_sched_clock);
+
+int paravirt_set_sched_clock(u64 (*func)(void))
 {
+	if (no_pv_sched_clock) {
+		pr_info("sched_clock: not configurable\n");
+		return -EPERM;
+	}
+
 	static_call_update(pv_sched_clock, func);
+
+	return 0;
 }
 
 /* These are in entry.S */
-- 
2.34.1
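Not part of this patch, only an illustration of the API change: other users of
paravirt_set_sched_clock() (e.g., Xen, VMware and Hyper-V register their
sched_clock through the same hook) would eventually want to honor the new
return value in the same way as the kvmclock hunk above. A minimal
hypothetical sketch, where xyz_sched_clock_read() and xyz_sched_clock_init()
are made-up names:

/* Hypothetical example -- not part of this patch. */
static u64 xyz_sched_clock_read(void)
{
	/* Read the hypervisor-provided clock here. */
	return 0;
}

static void __init xyz_sched_clock_init(void)
{
	/*
	 * With "no_pv_sched_clock" on the command line,
	 * paravirt_set_sched_clock() returns -EPERM and the guest keeps
	 * the native (TSC-based) sched_clock.
	 */
	if (paravirt_set_sched_clock(xyz_sched_clock_read))
		return;

	pr_info("xyz-clock: using paravirt sched_clock\n");
}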