[PATCH] handle offline CPUs

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



So far the code had trouble dealing with the situation of offline
CPUs.  In such a scenario the lock_on_cpu failed and thus the whole
tracing was shut down since an error was detected there and thus
nthreads_running != ncpus.

This change introduces a new thread_status Th_offline, which is kind
of an acceptable error status that is set on the thread when it could
not be started due to the CPU being offline.

To achieve that we read /proc/stat on startup and record all online
CPUs in a CPU set, such that we later in the thread can determine
whether the relevant CPU is online or leave early with thread_status
Th_offline otherwise.  For getting the application up Th_offline is
counted towards success, such that the overall application does not
fail due to offline CPUs.

Signed-off-by: Robert Schiele <rschiele@xxxxxxxxx>
---

Some more comments on this code:

It would be great if someone with deeper understanding of the state
machine could review the code, especially under the aspect of
introducing the new thread_status value Th_offline.

Additionally I was looking for a method to determine the set of
online CPUs without parsing test files from the proc file system
but couldn't find one.  Thus if someone is aware of a smarter way
to collect this information I would like to hear about it.

Robert

 blktrace.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 61 insertions(+), 3 deletions(-)

diff --git a/blktrace.c b/blktrace.c
index 3c8fb4c..f8a54d0 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -73,6 +73,7 @@ enum {
 
 enum thread_status {
 	Th_running,
+	Th_offline,
 	Th_leaving,
 	Th_error
 };
@@ -275,6 +276,8 @@ int data_is_native = -1;
 
 static int ndevs;
 static int ncpus;
+static cpu_set_t * cpu_online_mask;
+static size_t cpu_online_mask_size;
 static int pagesize;
 static int act_mask = ~0U;
 static int kill_running_trace;
@@ -309,6 +312,7 @@ static volatile int dp_entries;
 static pthread_cond_t mt_cond = PTHREAD_COND_INITIALIZER;
 static pthread_mutex_t mt_mutex = PTHREAD_MUTEX_INITIALIZER;
 static volatile int nthreads_running;
+static volatile int nthreads_offline;
 static volatile int nthreads_leaving;
 static volatile int nthreads_error;
 static volatile int tracers_run;
@@ -567,6 +571,8 @@ static void tracer_signal_ready(struct tracer *tp,
 
 	if (th_status == Th_running)
 		nthreads_running++;
+	else if (th_status == Th_offline)
+		nthreads_offline++;
 	else if (th_status == Th_error)
 		nthreads_error++;
 	else
@@ -579,7 +585,7 @@ static void tracer_signal_ready(struct tracer *tp,
 static void wait_tracers_ready(int ncpus_started)
 {
 	pthread_mutex_lock(&mt_mutex);
-	while ((nthreads_running + nthreads_error) < ncpus_started)
+	while ((nthreads_running + nthreads_offline + nthreads_error) < ncpus_started)
 		t_pthread_cond_wait(&mt_cond, &mt_mutex);
 	pthread_mutex_unlock(&mt_mutex);
 }
@@ -1806,11 +1812,21 @@ static int handle_pfds_entries(struct tracer *tp, int nevs, int force_read)
 	return nentries;
 }
 
+static int cpu_online(int cpu) {
+	return CPU_ISSET_S(cpu, cpu_online_mask_size, cpu_online_mask);
+}
+
 static void *thread_main(void *arg)
 {
 	int ret, ndone, to_val;
 	struct tracer *tp = arg;
 
+	if (! cpu_online(tp->cpu)) {
+		fprintf(stderr, "skip offline CPU %d\n", tp->cpu);
+		tracer_signal_ready(tp, Th_offline, 0);
+		return NULL;
+	}
+
 	ret = lock_on_cpu(tp->cpu);
 	if (ret)
 		goto err;
@@ -2637,7 +2653,7 @@ static int run_tracers(void)
 	}
 
 	start_tracers();
-	if (nthreads_running == ncpus) {
+	if (nthreads_running + nthreads_offline == ncpus) {
 		unblock_tracers();
 		start_buts();
 		if (net_mode == Net_client)
@@ -2648,7 +2664,7 @@ static int run_tracers(void)
 		stop_tracers();
 
 	wait_tracers();
-	if (nthreads_running == ncpus)
+	if (nthreads_running + nthreads_offline == ncpus)
 		show_stats(&devpaths);
 	if (net_client_use_send())
 		close_client_connections();
@@ -2657,6 +2673,42 @@ static int run_tracers(void)
 	return 0;
 }
 
+static int read_online_cpus() {
+	cpu_online_mask = CPU_ALLOC(ncpus);
+	cpu_online_mask_size = CPU_ALLOC_SIZE(ncpus);
+
+	CPU_ZERO_S(cpu_online_mask_size, cpu_online_mask);
+
+	if (ncpus > 1) {
+		FILE *fp;
+		char line[4096];
+
+		fp = my_fopen("/proc/stat", "r");
+		if (fp == NULL) {
+			CPU_FREE(cpu_online_mask);
+			fprintf(stderr, "opening /proc/stat failed %d/%s\n",
+				errno, strerror(errno));
+			return 1;
+		}
+
+		while (fgets(line, 4096, fp)) {
+			if ((! memcmp(line, "cpu", 3)) && line[3] != ' ') {
+				int cpu;
+				if (sscanf(line + 3, "%d ", &cpu) == 1)
+					CPU_SET_S(cpu, cpu_online_mask_size, cpu_online_mask);
+				else {
+					CPU_FREE(cpu_online_mask);
+					fprintf(stderr, "parsing /proc/stat failed\n");
+					return 1;
+				}
+			}
+		}
+	} else
+		CPU_SET_S(0, cpu_online_mask_size, cpu_online_mask);
+
+	return 0;
+}
+
 int main(int argc, char *argv[])
 {
 	int ret = 0;
@@ -2673,6 +2725,10 @@ int main(int argc, char *argv[])
 		ret = 1;
 		goto out;
 	}
+	if (read_online_cpus()) {
+		ret = 1;
+		goto out;
+	}
 
 	if (ndevs > 1 && output_name && strcmp(output_name, "-") != 0) {
 		fprintf(stderr, "-o not supported with multiple devices\n");
@@ -2707,6 +2763,8 @@ int main(int argc, char *argv[])
 	} else
 		ret = run_tracers();
 
+	CPU_FREE(cpu_online_mask);
+
 out:
 	if (pfp)
 		fclose(pfp);
-- 
1.8.4.5
--
To unsubscribe from this list: send the line "unsubscribe linux-btrace" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Netdev]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux