Under certain extreme conditions, the tick-based cputime accounting may
produce inaccurate data. For instance, guest CPU usage is sensitive to
interrupts firing right before the tick expires: the interrupt forces a
transition from guest into kernel context, and the whole time slice ends
up wrongly accounted as system time. The issue is exacerbated when the
interrupt source is in sync with the tick, which skews the usage metrics
significantly towards system time.

On CPUs with full dynticks enabled, cputime accounting leverages the
context tracking subsystem to measure usage and isn't susceptible to
this sort of race condition. However, it comes with a higher overhead:
the additional accounting itself plus the extra dyntick state tracking
on every user<->kernel<->guest transition (RmW + mb).

So, in order to get the best of both worlds, introduce a cputime
configuration option that allows using the full dynticks accounting
scheme on NOHZ & NOHZ_IDLE CPUs, while avoiding the expensive
user<->kernel<->guest dyntick transitions.

Signed-off-by: Nicolas Saenz Julienne <nsaenz@xxxxxxxxxx>
Signed-off-by: Jack Allister <jalliste@xxxxxxxxxxxx>
---
NOTE: This wasn't tested in depth; it's mostly intended to highlight the
issue we're trying to solve. Also ccing KVM folks, since it's relevant
to guest CPU usage accounting. (A small user-space sketch for observing
the accounting skew is appended after the patch.)

 include/linux/context_tracking.h |  4 ++--
 include/linux/vtime.h            |  6 ++++--
 init/Kconfig                     | 24 +++++++++++++++++++++++-
 kernel/context_tracking.c        | 25 ++++++++++++++++++++++---
 4 files changed, 51 insertions(+), 8 deletions(-)

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 6e76b9dba00e7..dd9b500359aa6 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -102,11 +102,11 @@ static __always_inline void context_tracking_guest_exit(void) { }
 #define CT_WARN_ON(cond) do { } while (0)
 #endif /* !CONFIG_CONTEXT_TRACKING_USER */
 
-#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
+#if defined(CONFIG_CONTEXT_TRACKING_USER_FORCE) || defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE)
 extern void context_tracking_init(void);
 #else
 static inline void context_tracking_init(void) { }
-#endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE */
+#endif /* CONFIG_CONTEXT_TRACKING_USER_FORCE || CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE */
 
 #ifdef CONFIG_CONTEXT_TRACKING_IDLE
 extern void ct_idle_enter(void);
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
index 3684487d01e1c..d78d01eead6e9 100644
--- a/include/linux/vtime.h
+++ b/include/linux/vtime.h
@@ -79,12 +79,14 @@ static inline bool vtime_accounting_enabled(void)
 
 static inline bool vtime_accounting_enabled_cpu(int cpu)
 {
-	return context_tracking_enabled_cpu(cpu);
+	return IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) ||
+	       context_tracking_enabled_cpu(cpu);
 }
 
 static inline bool vtime_accounting_enabled_this_cpu(void)
 {
-	return context_tracking_enabled_this_cpu();
+	return IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) ||
+	       context_tracking_enabled_this_cpu();
 }
 
 extern void vtime_task_switch_generic(struct task_struct *prev);
diff --git a/init/Kconfig b/init/Kconfig
index 9ffb103fc927b..86877e1f416fc 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -473,6 +473,9 @@ menu "CPU/Task time and stats accounting"
 config VIRT_CPU_ACCOUNTING
 	bool
 
+config VIRT_CPU_ACCOUNTING_GEN
+	bool
+
 choice
 	prompt "Cputime accounting"
 	default TICK_CPU_ACCOUNTING
@@ -501,12 +504,13 @@ config VIRT_CPU_ACCOUNTING_NATIVE
 	  this also enables accounting of stolen time on logically-partitioned
 	  systems.
 
-config VIRT_CPU_ACCOUNTING_GEN
+config VIRT_CPU_ACCOUNTING_DYNTICKS
 	bool "Full dynticks CPU time accounting"
 	depends on HAVE_CONTEXT_TRACKING_USER
 	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
 	depends on GENERIC_CLOCKEVENTS
 	select VIRT_CPU_ACCOUNTING
+	select VIRT_CPU_ACCOUNTING_GEN
 	select CONTEXT_TRACKING_USER
 	help
 	  Select this option to enable task and CPU time accounting on full
@@ -520,8 +524,26 @@ config VIRT_CPU_ACCOUNTING_GEN
 
 	  If unsure, say N.
 
+config VIRT_CPU_ACCOUNTING_GEN_FORCE
+	bool "Force full dynticks CPU time accounting"
+	depends on HAVE_CONTEXT_TRACKING_USER
+	depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
+	depends on GENERIC_CLOCKEVENTS
+	select VIRT_CPU_ACCOUNTING
+	select VIRT_CPU_ACCOUNTING_GEN
+	select CONTEXT_TRACKING_USER
+	help
+	  Select this option to forcibly enable the full dynticks CPU time
+	  accounting. This accounting is implemented by watching every
+	  kernel-user boundaries using the context tracking subsystem. The
+	  accounting is thus performed at the expense of some overhead, but is
+	  more precise than tick based CPU accounting.
+
+	  If unsure, say N.
+
 endchoice
+
 config IRQ_TIME_ACCOUNTING
 	bool "Fine granularity task level IRQ time accounting"
 	depends on HAVE_IRQ_TIME_ACCOUNTING && !VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 6ef0b35fc28c5..f70949430cf11 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -537,6 +537,13 @@ void noinstr __ct_user_enter(enum ctx_state state)
 				 */
 				raw_atomic_add(state, &ct->state);
 			}
+
+			if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) &&
+			    state == CONTEXT_USER) {
+				instrumentation_begin();
+				vtime_user_enter(current);
+				instrumentation_end();
+			}
 		}
 	}
 	context_tracking_recursion_exit();
@@ -645,6 +652,13 @@ void noinstr __ct_user_exit(enum ctx_state state)
 				 */
 				raw_atomic_sub(state, &ct->state);
 			}
+
+			if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE) &&
+			    state == CONTEXT_USER) {
+				instrumentation_begin();
+				vtime_user_exit(current);
+				instrumentation_end();
+			}
 		}
 	}
 	context_tracking_recursion_exit();
@@ -715,13 +729,18 @@ void __init ct_cpu_track_user(int cpu)
 	initialized = true;
 }
 
-#ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE
+#if defined(CONFIG_CONTEXT_TRACKING_USER_FORCE) || defined(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE)
 void __init context_tracking_init(void)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		ct_cpu_track_user(cpu);
+	if (IS_ENABLED(CONFIG_CONTEXT_TRACKING_USER_FORCE)) {
+		for_each_possible_cpu(cpu)
+			ct_cpu_track_user(cpu);
+	}
+
+	if (IS_ENABLED(CONFIG_VIRT_CPU_ACCOUNTING_GEN_FORCE))
+		static_branch_inc(&context_tracking_key);
 }
 #endif
-- 
2.40.1
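
Not part of the patch: a minimal user-space sketch of how the skew
described in the commit message can be observed. It samples the
aggregate "cpu" line of /proc/stat before and after a measurement
window (run the tick-synchronous guest workload in between) and prints
the system and guest tick deltas; under tick-based accounting the
system column grows at the expense of guest time. The 10-field layout
follows proc(5); the 10-second window is an arbitrary choice for
illustration.

/*
 * Illustrative only: compare how much of a measurement window is
 * attributed to system vs. guest time in the aggregate /proc/stat line.
 * Field order per proc(5):
 * user nice system idle iowait irq softirq steal guest guest_nice.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct cpu_sample {
	unsigned long long user, nice, system, idle;
	unsigned long long iowait, irq, softirq, steal;
	unsigned long long guest, guest_nice;
};

static int read_cpu_sample(struct cpu_sample *s)
{
	FILE *f = fopen("/proc/stat", "r");
	int ret;

	if (!f)
		return -1;

	/* The first line aggregates all CPUs: "cpu  user nice system ..." */
	ret = fscanf(f, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu",
		     &s->user, &s->nice, &s->system, &s->idle, &s->iowait,
		     &s->irq, &s->softirq, &s->steal, &s->guest, &s->guest_nice);
	fclose(f);
	return ret == 10 ? 0 : -1;
}

int main(void)
{
	struct cpu_sample a, b;

	if (read_cpu_sample(&a))
		return EXIT_FAILURE;

	sleep(10);	/* run the guest workload of interest meanwhile */

	if (read_cpu_sample(&b))
		return EXIT_FAILURE;

	printf("system: %llu ticks, guest: %llu ticks\n",
	       b.system - a.system, b.guest - a.guest);
	return EXIT_SUCCESS;
}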