Re: cpus_allowed per thread behavior

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Thu, Feb 27 2014, Jens Axboe wrote:
> On 2014-02-26 17:12, Elliott, Robert (Server Storage) wrote:
> >>-----Original Message-----
> >>From: Jens Axboe [mailto:axboe@xxxxxxxxx]
> >>Sent: Wednesday, 26 February, 2014 6:08 PM
> >>To: Elliott, Robert (Server Storage); fio@xxxxxxxxxxxxxxx
> >>Subject: Re: cpus_allowed per thread behavior
> >>
> >>On 2014-02-26 15:54, Elliott, Robert (Server Storage) wrote:
> >>>fio seems to assign the same cpus_allowed/cpumask value to all threads.
> >>  > I think this allows the OS to move the threads around those CPUs.
> >>
> >>Correct. As long as the number of cpus in the mask is equal to (or
> >>larger than) the number of jobs within that group, the OS is free to
> >>place them wherever it wants. In practice, unless the CPU scheduling is
> >>horribly broken, they tend to "stick" for most intents and purposes.
> >>
> >>>In comparison, iometer assigns its worker threads to specific CPUs
> >>  > within the cpumask in round-robin manner.  Would that be worth adding
> >>  > to fio, perhaps with an option like cpus_allowed_policy=roundrobin?
> >>
> >>Sure, we could add that feature. You can get the same setup now, if you
> >>"unroll" the job section, but that might not always be practical. How
> >>about cpus_allowed_policy, with 'shared' being the existing (and
> >>default) behavior and 'split' being each thread grabbing one of the CPUs?
> >
> >Perhaps NUMA and hyperthreading aware allocation policies would
> >also be useful?
> >
> >I don't know how consistent hyperthread CPU numbering is across
> >systems.  On some servers I've tried, linux assigns 0-5 to the main
> >cores and 6-11 to the hyperthreaded siblings, while Windows assigns
> >0,2,4,6,8,10 to the main cores and 1,3,5,7,9,11 to their
> >hyperthreaded siblings.
> 
> Linux follows the firmware on that, at least as far as I know. I've
> seen machines renumber when getting a new firmware, going from the
> second scheme you list to the first. But for the below, we cannot
> assume any of them, on some machines you also have > 2 threads per
> core. So the topology would have to be queried.

Here's a test patch that implements the shared/split policy.


diff --git a/HOWTO b/HOWTO
index 4dacd98965ea..040b8a8949c6 100644
--- a/HOWTO
+++ b/HOWTO
@@ -928,6 +928,17 @@ cpus_allowed=str Controls the same options as cpumask, but it allows a text
 		allows a range of CPUs. Say you wanted a binding to CPUs
 		1, 5, and 8-15, you would set cpus_allowed=1,5,8-15.
 
+cpus_allowed_policy=str Set the policy of how fio distributes the CPUs
+		specified by cpus_allowed or cpumask. Two policies are
+		supported:
+
+		shared	All jobs will share the CPU set specified.
+		split	Each job will get a unique CPU from the CPU set.
+
+		Shared is the default behaviour, if the option isn't
+		specified. If split is specified, then fio will error out if
+		there are more jobs defined than CPUs given in the set.
+
 numa_cpu_nodes=str Set this job running on spcified NUMA nodes' CPUs. The
 		arguments allow comma delimited list of cpu numbers,
 		A-B ranges, or 'all'. Note, to enable numa options support,
diff --git a/backend.c b/backend.c
index ee395bd0ea57..12c76d8545ef 100644
--- a/backend.c
+++ b/backend.c
@@ -1278,6 +1278,15 @@ static void *thread_main(void *data)
 	 * allocations.
 	 */
 	if (o->cpumask_set) {
+		if (o->cpus_allowed_policy == FIO_CPUS_SPLIT) {
+			ret = fio_cpus_split(&o->cpumask, td->thread_number - 1);
+			if (!ret) {
+				log_err("fio: no CPUs set\n");
+				log_err("fio: Try increasing number of available CPUs\n");
+				td_verror(td, EINVAL, "cpus_split");
+				goto err;
+			}
+		}
 		ret = fio_setaffinity(td->pid, o->cpumask);
 		if (ret == -1) {
 			td_verror(td, errno, "cpu_set_affinity");
diff --git a/cconv.c b/cconv.c
index fd8d0ad85142..357a7845e559 100644
--- a/cconv.c
+++ b/cconv.c
@@ -188,6 +188,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->numjobs = le32_to_cpu(top->numjobs);
 	o->cpumask_set = le32_to_cpu(top->cpumask_set);
 	o->verify_cpumask_set = le32_to_cpu(top->verify_cpumask_set);
+	o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy);
 	o->iolog = le32_to_cpu(top->iolog);
 	o->rwmixcycle = le32_to_cpu(top->rwmixcycle);
 	o->nice = le32_to_cpu(top->nice);
@@ -343,6 +344,7 @@ void convert_thread_options_to_net(struct thread_options_pack *top,
 	top->numjobs = cpu_to_le32(o->numjobs);
 	top->cpumask_set = cpu_to_le32(o->cpumask_set);
 	top->verify_cpumask_set = cpu_to_le32(o->verify_cpumask_set);
+	top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy);
 	top->iolog = cpu_to_le32(o->iolog);
 	top->rwmixcycle = cpu_to_le32(o->rwmixcycle);
 	top->nice = cpu_to_le32(o->nice);
diff --git a/fio.1 b/fio.1
index c530d8440cd0..294e3836b4e5 100644
--- a/fio.1
+++ b/fio.1
@@ -833,6 +833,23 @@ may run on.  See \fBsched_setaffinity\fR\|(2).
 .BI cpus_allowed \fR=\fPstr
 Same as \fBcpumask\fR, but allows a comma-delimited list of CPU numbers.
 .TP
+.BI cpus_allowed_policy \fR=\fPstr
+Set the policy of how fio distributes the CPUs specified by \fBcpus_allowed\fR
+or \fBcpumask\fR. Two policies are supported:
+.RS
+.RS
+.TP
+.B shared
+All jobs will share the CPU set specified.
+.TP
+.B split
+Each job will get a unique CPU from the CPU set.
+.RE
+.P
+\fBshared\fR is the default behaviour, if the option isn't specified. If
+\fBsplit\fR is specified, then fio will error out if there are more jobs
+defined than CPUs given in the set.
+.TP
 .BI numa_cpu_nodes \fR=\fPstr
 Set this job running on specified NUMA nodes' CPUs. The arguments allow
 comma delimited list of cpu numbers, A-B ranges, or 'all'.
diff --git a/fio.h b/fio.h
index 9159b0c2de3e..6f5f29fb3a97 100644
--- a/fio.h
+++ b/fio.h
@@ -629,4 +629,9 @@ enum {
 	FIO_RAND_GEN_LFSR,
 };
 
+enum {
+	FIO_CPUS_SHARED		= 0,
+	FIO_CPUS_SPLIT,
+};
+
 #endif
diff --git a/options.c b/options.c
index 6d3956e307bf..c1a8f323e956 100644
--- a/options.c
+++ b/options.c
@@ -394,6 +394,21 @@ static int str_exitall_cb(void)
 }
 
 #ifdef FIO_HAVE_CPU_AFFINITY
+int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu)
+{
+	const long max_cpu = cpus_online();
+	long i;
+
+	for (i = 0; i < max_cpu; i++) {
+		if (i != cpu)
+			fio_cpu_clear(mask, i);
+	}
+
+	return fio_cpu_count(mask);
+}
+
 static int str_cpumask_cb(void *data, unsigned long long *val)
 {
 	struct thread_data *td = data;
@@ -2875,6 +2890,27 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
 	},
+	{
+		.name	= "cpus_allowed_policy",
+		.lname	= "CPUs allowed distribution policy",
+		.type	= FIO_OPT_STR,
+		.off1	= td_var_offset(cpus_allowed_policy),
+		.help	= "Distribution policy for cpus_allowed",
+		.parent = "cpus_allowed",
+		.prio	= 1,
+		.posval = {
+			  { .ival = "shared",
+			    .oval = FIO_CPUS_SHARED,
+			    .help = "Mask shared between threads",
+			  },
+			  { .ival = "split",
+			    .oval = FIO_CPUS_SPLIT,
+			    .help = "Mask split between threads",
+			  },
+		},
+		.category = FIO_OPT_C_GENERAL,
+		.group	= FIO_OPT_G_CRED,
+	},
 #endif
 #ifdef CONFIG_LIBNUMA
 	{
diff --git a/os/os-freebsd.h b/os/os-freebsd.h
index 57ce409c67fd..402792a0f7d7 100644
--- a/os/os-freebsd.h
+++ b/os/os-freebsd.h
@@ -32,6 +32,7 @@ typedef cpuset_t os_cpu_mask_t;
 
 #define fio_cpu_clear(mask, cpu)        (void) CPU_CLR((cpu), (mask))
 #define fio_cpu_set(mask, cpu)          (void) CPU_SET((cpu), (mask))
+#define fio_cpu_count(mask)		CPU_COUNT((mask))
 
 static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 {
diff --git a/os/os-linux.h b/os/os-linux.h
index 5d1d62db27a0..3ed8c2ef31f2 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -61,6 +61,7 @@ typedef struct drand48_data os_random_state_t;
 
 #define fio_cpu_clear(mask, cpu)	(void) CPU_CLR((cpu), (mask))
 #define fio_cpu_set(mask, cpu)		(void) CPU_SET((cpu), (mask))
+#define fio_cpu_count(mask)		CPU_COUNT((mask))
 
 static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 {
diff --git a/os/os-solaris.h b/os/os-solaris.h
index e6612118ace4..7a0a3f0bfeca 100644
--- a/os/os-solaris.h
+++ b/os/os-solaris.h
@@ -111,6 +111,16 @@ static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 	return 0;
 }
 
+static inline int fio_cpu_count(os_cpu_mask_t *mask)
+{
+	unsigned int num_cpus;
+
+	if (pset_info(*mask, NULL, &num_cpus, NULL) < 0)
+		return 0;
+
+	return num_cpus;
+}
+
 static inline int fio_cpuset_exit(os_cpu_mask_t *mask)
 {
 	if (pset_destroy(*mask) < 0)
diff --git a/os/os-windows.h b/os/os-windows.h
index de120b64ff7e..7bfe3d2255e4 100644
--- a/os/os-windows.h
+++ b/os/os-windows.h
@@ -214,6 +214,11 @@ static inline void fio_cpu_set(os_cpu_mask_t *mask, int cpu)
 	*mask |= 1 << cpu;
 }
 
+static inline int fio_cpu_count(os_cpu_mask_t *mask)
+{
+	return hweight64(*mask);
+}
+
 static inline int fio_cpuset_init(os_cpu_mask_t *mask)
 {
 	*mask = 0;
diff --git a/os/os.h b/os/os.h
index 03d1e9a14565..a6bc17f09b57 100644
--- a/os/os.h
+++ b/os/os.h
@@ -80,7 +80,10 @@ typedef struct aiocb os_aiocb_t;
 #define fio_getaffinity(pid, mask)	do { } while (0)
 #define fio_cpu_clear(mask, cpu)	do { } while (0)
 #define fio_cpuset_exit(mask)		(-1)
+#define fio_cpus_split(mask, cpu)	(0)
 typedef unsigned long os_cpu_mask_t;
+#else
+extern int fio_cpus_split(os_cpu_mask_t *mask, unsigned int cpu);
 #endif
 
 #ifndef FIO_HAVE_IOPRIO
diff --git a/thread_options.h b/thread_options.h
index 14a4e54abcc7..4ea6ebd06a0c 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -155,6 +155,7 @@ struct thread_options {
 	unsigned int cpumask_set;
 	os_cpu_mask_t verify_cpumask;
 	unsigned int verify_cpumask_set;
+	unsigned int cpus_allowed_policy;
 #ifdef CONFIG_LIBNUMA
 	struct bitmask *numa_cpunodesmask;
 	unsigned int numa_cpumask_set;
@@ -378,6 +379,7 @@ struct thread_options_pack {
 	uint32_t cpumask_set;
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
 	uint32_t verify_cpumask_set;
+	uint32_t cpus_allowed_policy;
 	uint32_t iolog;
 	uint32_t rwmixcycle;
 	uint32_t rwmix[DDIR_RWDIR_CNT];

-- 
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe fio" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel]     [Linux SCSI]     [Linux IDE]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite News]     [Linux SCSI]

  Powered by Linux