[PATCH] sched: introduce configurable delay before entering idle

Marcelo Tosatti <mtosatti@xxxxxxxxxx> · Tue, 7 May 2019 15:56:49 -0300

Certain workloads perform poorly on KVM compared to baremetal
due to baremetal's ability to perform mwait on NEED_RESCHED
bit of task flags (therefore skipping the IPI).

This patch introduces a configurable busy-wait delay before entering the
architecture idle routine, allowing wakeup IPIs to be skipped
(if the wakeup happens in that window).

The real-life workload whose performance this patch improves
is SAP HANA (by 5-10%); for that case, setting idle_spin to 30
(microseconds) is sufficient.

This patch improves the attached server.py and client.py example 
as follows:

Host:                           31.814230202231556
Guest:                          38.17718765199993       (83 %)
Guest, idle_spin=50us:          33.317709898000004      (95 %)
Guest, idle_spin=220us:         32.27826551499999       (98 %)

Signed-off-by: Marcelo Tosatti <mtosatti@xxxxxxxxxx>

---
 kernel/sched/idle.c |   86 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f5516bae0c1b..bca7656a7ea0 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -216,6 +216,29 @@ static void cpuidle_idle_call(void)
 	rcu_idle_exit();
 }
 
+static unsigned int spin_before_idle_us;	/* usecs; 0 disables the spin */
+
+/*
+ * Spin (IRQs on) until need_resched is set or spin_before_idle_us elapses, so
+ * a wakeup in that window can skip the IPI. Returns with IRQs disabled.
+ */
+static void do_spin_before_idle(void)
+{
+	ktime_t now, end_spin;
+
+	end_spin = ktime_add_us(ktime_get(), spin_before_idle_us);
+	rcu_idle_enter();
+	local_irq_enable();
+	stop_critical_timings();
+	do {
+		cpu_relax();
+		now = ktime_get();
+	} while (!tif_need_resched() && ktime_before(now, end_spin));
+	start_critical_timings();
+	rcu_idle_exit();
+	local_irq_disable();
+}
+
 /*
  * Generic idle loop implementation
  *
@@ -259,6 +282,8 @@ static void do_idle(void)
 			tick_nohz_idle_restart_tick();
 			cpu_idle_poll();
 		} else {
+			if (spin_before_idle_us)
+				do_spin_before_idle();
 			cpuidle_idle_call();
 		}
 		arch_cpu_idle_exit();
@@ -465,3 +490,64 @@ const struct sched_class idle_sched_class = {
 	.switched_to		= switched_to_idle,
 	.update_curr		= update_curr_idle,
 };
+
+
+/* sysfs store for idle_spin: decimal microseconds in [0, USEC_PER_SEC]. */
+static ssize_t store_idle_spin(struct kobject *kobj,
+			       struct kobj_attribute *attr,
+			       const char *buf, size_t count)
+{
+	unsigned int val;
+	int err = kstrtouint(buf, 10, &val);
+
+	if (err)
+		return err;	/* keep -EINVAL/-ERANGE from the parser */
+	if (val > USEC_PER_SEC)
+		return -EINVAL;
+	spin_before_idle_us = val;
+	return count;
+}
+
+/*
+ * sysfs show handler for the idle_spin attribute: prints the current
+ * spin_before_idle_us value (microseconds) into buf, newline-terminated.
+ */
+static ssize_t show_idle_spin(struct kobject *kobj,
+			      struct kobj_attribute *attr,
+			      char *buf)
+{
+	return sprintf(buf, "%u\n", spin_before_idle_us);
+}
+
+static struct kobj_attribute idle_spin_attr =	/* "idle_spin", mode 0644 */
+	__ATTR(idle_spin, 0644, show_idle_spin, store_idle_spin);
+
+static struct attribute *sched_attrs[] = {
+	&idle_spin_attr.attr,
+	NULL,				/* list terminator */
+};
+
+static const struct attribute_group sched_attr_group = {
+	.attrs = sched_attrs,		/* no .name is set for this group */
+};
+
+static struct kobject *sched_kobj;	/* assigned in sched_sysfs_init() */
+
+/*
+ * Create the "sched" kobject under kernel_kobj and attach sched_attr_group
+ * to it. Returns 0, -ENOMEM, or the sysfs_create_group() error.
+ */
+static int __init sched_sysfs_init(void)
+{
+	int ret;
+
+	sched_kobj = kobject_create_and_add("sched", kernel_kobj);
+	if (!sched_kobj)
+		return -ENOMEM;
+
+	ret = sysfs_create_group(sched_kobj, &sched_attr_group);
+	if (ret)
+		kobject_put(sched_kobj);	/* drop the kobject on failure */
+	return ret;
+}
+postcore_initcall(sched_sysfs_init);
#!/bin/python3

import socket
import sys
import struct, fcntl, os
import os, errno, time
import time

# Benchmark client: connect to the server on 127.0.0.1:999, send every
# received message straight back 90000 times (with a ~20us pause each round)
# and print the elapsed CLOCK_MONOTONIC time in seconds.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
	server_address = ('127.0.0.1', 999)
	print ("connecting to 127.0.0.1")
	sock.connect(server_address)

	nr_writes = 0

	start_time = time.clock_gettime(time.CLOCK_MONOTONIC)

	while nr_writes < 90000:
		data = sock.recv(4096)
		if len(data) == 0:
			# Peer closed before the run finished; no timing is reported.
			print("connection closed!\n")
			sys.exit(0)
		# sleep 20us
		time.sleep(20/1000000)
		# sendall() retries short writes; send() may transmit only part of data
		sock.sendall(data)
		nr_writes = nr_writes+1

	# Take the end timestamp before the socket is closed by the with-block.
	end_time = time.clock_gettime(time.CLOCK_MONOTONIC)

delta = end_time - start_time
print(delta)
#!/bin/python3

import socket
import sys
import struct, fcntl, os
import os, errno, time
import time

# Benchmark server: accept one connection on 127.0.0.1:999, then repeatedly
# send a line of text, wait for the peer's reply and pause ~200us, until the
# peer closes the connection.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
	# Allow back-to-back runs without waiting for the old socket to expire.
	sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
	sock.bind(('127.0.0.1', 999))
	sock.listen(10)
	conn, addr = sock.accept()

	nr_written = 0
	with conn:
		try:
			while True:
				conn.sendall(b"a response line of text")
				data = conn.recv(1024)
				if not data:
					break
				# sleep 200us
				time.sleep(200/1000000)
				nr_written = nr_written+1
		except (BrokenPipeError, ConnectionResetError):
			# The peer may close while a response is still in flight;
			# treat that as the normal end of the run.
			pass