Here is the data I get when running the contextswitch benchmark
(https://github.com/tsuna/contextswitch):
before patch:
2000000 process context switches in 4822613801ns (2411.3ns/ctxsw)
after patch:
2000000 process context switches in 3584098241ns (1792.0ns/ctxsw)
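
The polling only takes effect when the new sysctl is set to a non-zero
threshold. As a minimal userspace sketch (not part of this patch; the
200000ns threshold is only an illustrative value, equivalent to
"echo 200000 > /proc/sys/kernel/poll_threshold_ns"):

  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	/* Enable up to 200us of idle polling before the real idle state. */
  	int fd = open("/proc/sys/kernel/poll_threshold_ns", O_WRONLY);

  	if (fd < 0) {
  		perror("open poll_threshold_ns");
  		return 1;
  	}
  	if (write(fd, "200000", 6) != 6) {
  		perror("write poll_threshold_ns");
  		close(fd);
  		return 1;
  	}
  	close(fd);
  	return 0;
  }
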
Signed-off-by: Yang Zhang <yang.zhang.wz@xxxxxxxxx>
---
Documentation/sysctl/kernel.txt | 10 ++++++++++
arch/x86/kernel/process.c | 21 +++++++++++++++++++++
include/linux/kernel.h | 3 +++
kernel/sched/idle.c | 3 +++
kernel/sysctl.c | 9 +++++++++
5 files changed, 46 insertions(+)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index bac23c1..4e71bfe 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -63,6 +63,7 @@ show up in /proc/sys/kernel:
- perf_event_max_stack
- perf_event_max_contexts_per_stack
- pid_max
+- poll_threshold_ns [ X86 only ]
- powersave-nap [ PPC only ]
- printk
- printk_delay
@@ -702,6 +703,15 @@ kernel tries to allocate a number starting from this one.
==============================================================
+poll_threshold_ns: (X86 only)
+
+This parameter controls the maximum time, in nanoseconds, to poll before
+entering a real idle state. The default value is 0, which means do not
+poll. It is recommended to set it to a non-zero value when running
+latency-sensitive workloads in a VM.
+
+==============================================================
+
powersave-nap: (PPC only)
If set, Linux-PPC will use the 'nap' mode of powersaving,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb8842..6361783 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,6 +39,10 @@
#include <asm/desc.h>
#include <asm/prctl.h>
+#ifdef CONFIG_HYPERVISOR_GUEST
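+/* Max time, in ns, to poll in idle before entering a real idle state. */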
+unsigned long poll_threshold_ns;
+#endif
+
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
@@ -313,6 +317,23 @@ static inline void play_dead(void)
}
#endif
+#ifdef CONFIG_HYPERVISOR_GUEST
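+/*
+ * Poll for up to poll_threshold_ns before entering a real idle state,
+ * breaking out early as soon as a reschedule is pending.
+ */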
+void arch_cpu_idle_poll(void)
+{
+	ktime_t start, cur, stop;
+
+	if (poll_threshold_ns) {
+		start = cur = ktime_get();
+		stop = ktime_add_ns(start, poll_threshold_ns);
+		do {
+			if (need_resched())
+				break;
+			cur = ktime_get();
+		} while (ktime_before(cur, stop));
+	}
+}
+#endif
+
void arch_cpu_idle_enter(void)
{
tsc_verify_tsc_adjust(false);
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 13bc08a..04cf774 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -460,6 +460,9 @@ extern __scanf(2, 0)
extern int sysctl_panic_on_stackoverflow;
extern bool crash_kexec_post_notifiers;
+#ifdef CONFIG_HYPERVISOR_GUEST
+extern unsigned long poll_threshold_ns;
+#endif
/*
* panic_cpu is used for synchronizing panic() and crash_kexec() execution. It
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 2a25a9e..e789f99 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -74,6 +74,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
}
/* Weak implementations for optional arch specific functions */
+void __weak arch_cpu_idle_poll(void) { }
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
@@ -219,6 +220,8 @@ static void do_idle(void)
*/
__current_set_polling();
+	arch_cpu_idle_poll();
+
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a..9174d57 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1203,6 +1203,15 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
.extra2 = &one,
},
#endif
+#ifdef CONFIG_HYPERVISOR_GUEST
+	{
+		.procname	= "poll_threshold_ns",
+		.data		= &poll_threshold_ns,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+#endif
{ }
};
--
1.8.3.1