LKML Archive mirror
* [PATCH 1/2] perf bench futex: cache align the worker struct
@ 2016-10-16 19:08 Sebastian Andrzej Siewior
  2016-10-16 19:08 ` [PATCH 2/2] perf bench futex: add NUMA support Sebastian Andrzej Siewior
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-10-16 19:08 UTC
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Davidlohr Bueso,
	Sebastian Andrzej Siewior

It popped up in perf testing that the worker consumes some amount of
CPU. It boils down to the increment of `ops` which causes cache line
bouncing between the individual threads.
The patch aligns the struct to 256 bytes to ensure that no cache line
is shared among CPUs. 128 bytes is the x86 worst case, and grep shows that
L1_CACHE_SHIFT is set to 8 (i.e. 256 bytes) on s390.
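
For illustration only (not part of the patch), a minimal standalone sketch of
the effect, assuming GCC/Clang and POSIX threads; the names are made up. With
the aligned attribute each per-thread counter lands on its own cache line,
without it the two increments keep bouncing a shared line:

#include <pthread.h>
#include <stdio.h>

#define SMP_CACHE_BYTES 256
#define __cacheline_aligned __attribute__((aligned(SMP_CACHE_BYTES)))

struct counter {
	/* volatile so the increment really hits memory each iteration */
	volatile unsigned long ops;
} __cacheline_aligned;		/* drop the attribute to observe the bouncing */

static struct counter counters[2];

static void *bump(void *arg)
{
	struct counter *c = arg;
	unsigned long i;

	/* the hot increment the commit message refers to */
	for (i = 0; i < 100000000UL; i++)
		c->ops++;
	return NULL;
}

int main(void)
{
	pthread_t t[2];
	int i;

	for (i = 0; i < 2; i++)
		pthread_create(&t[i], NULL, bump, &counters[i]);
	for (i = 0; i < 2; i++)
		pthread_join(t[i], NULL);
	printf("%lu %lu\n", counters[0].ops, counters[1].ops);
	return 0;
}

Build with e.g. "gcc -O2 -pthread" and compare runtimes with and without the
aligned attribute.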

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 tools/perf/bench/futex-hash.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5febd2..d9e5e80bb4d0 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -39,12 +39,15 @@ static unsigned int threads_starting;
 static struct stats throughput_stats;
 static pthread_cond_t thread_parent, thread_worker;
 
+#define SMP_CACHE_BYTES 256
+#define __cacheline_aligned __attribute__ ((aligned (SMP_CACHE_BYTES)))
+
 struct worker {
 	int tid;
 	u_int32_t *futex;
 	pthread_t thread;
 	unsigned long ops;
-};
+} __cacheline_aligned;
 
 static const struct option options[] = {
 	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
-- 
2.9.3


* [PATCH 2/2] perf bench futex: add NUMA support
  2016-10-16 19:08 [PATCH 1/2] perf bench futex: cache align the worker struct Sebastian Andrzej Siewior
@ 2016-10-16 19:08 ` Sebastian Andrzej Siewior
  2016-10-17 14:38   ` Arnaldo Carvalho de Melo
  2016-10-18  1:09 ` [PATCH 1/2] perf bench futex: cache align the worker struct Davidlohr Bueso
  2016-10-24 19:06 ` [tip:perf/core] perf bench futex: Cache align the worker struct tip-bot for Sebastian Andrzej Siewior
  2 siblings, 1 reply; 16+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-10-16 19:08 UTC
  To: Arnaldo Carvalho de Melo
  Cc: Peter Zijlstra, Ingo Molnar, linux-kernel, Davidlohr Bueso,
	Sebastian Andrzej Siewior

By default the application uses malloc() and all available CPUs. This
patch introduces NUMA support which means:
- memory is allocated node local via numa_alloc_local()
- all CPUs of the specified NUMA node are used. This is also true if the
  number of threads set is greater than the number of CPUs available on
  this node.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 tools/perf/bench/Build        |  4 ++
 tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
index 60bf11943047..9e6e518d7d62 100644
--- a/tools/perf/bench/Build
+++ b/tools/perf/bench/Build
@@ -1,3 +1,7 @@
+ifdef CONFIG_NUMA
+CFLAGS_futex-hash.o   += -DCONFIG_NUMA=1
+endif
+
 perf-y += sched-messaging.o
 perf-y += sched-pipe.o
 perf-y += mem-functions.o
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index d9e5e80bb4d0..8db4a5bd6a4e 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -25,6 +25,9 @@
 
 #include <err.h>
 #include <sys/time.h>
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif
 
 static unsigned int nthreads = 0;
 static unsigned int nsecs    = 10;
@@ -32,6 +35,7 @@ static unsigned int nsecs    = 10;
 static unsigned int nfutexes = 1024;
 static bool fshared = false, done = false, silent = false;
 static int futex_flag = 0;
+static int numa_node = -1;
 
 struct timeval start, end, runtime;
 static pthread_mutex_t thread_lock;
@@ -55,9 +59,28 @@ static const struct option options[] = {
 	OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
 	OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"),
 	OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of private ones"),
+#ifdef CONFIG_NUMA
+	OPT_INTEGER( 'n', "numa",   &numa_node,  "Specify the NUMA node"),
+#endif
 	OPT_END()
 };
 
+#ifndef CONFIG_NUMA
+static int numa_run_on_node(int node __maybe_unused) { return 0; }
+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
+static void *numa_alloc_local(size_t size) { return malloc(size); }
+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
+#endif
+
+static bool cpu_is_local(int cpu)
+{
+	if (numa_node < 0)
+		return true;
+	if (numa_node_of_cpu(cpu) == numa_node)
+		return true;
+	return false;
+}
+
 static const char * const bench_futex_hash_usage[] = {
 	"perf bench futex hash <options>",
 	NULL
@@ -123,6 +146,8 @@ int bench_futex_hash(int argc, const char **argv,
 	unsigned int i, ncpus;
 	pthread_attr_t thread_attr;
 	struct worker *worker = NULL;
+	char *node_str = NULL;
+	unsigned int cpunum;
 
 	argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
 	if (argc) {
@@ -136,18 +161,50 @@ int bench_futex_hash(int argc, const char **argv,
 	act.sa_sigaction = toggle_done;
 	sigaction(SIGINT, &act, NULL);
 
-	if (!nthreads) /* default to the number of CPUs */
-		nthreads = ncpus;
+	if (!nthreads) {
+		/* default to the number of CPUs per NUMA node */
+		if (numa_node < 0) {
+			nthreads = ncpus;
+		} else {
+			for (i = 0; i < ncpus; i++) {
+				if (cpu_is_local(i))
+					nthreads++;
+			}
+			if (!nthreads)
+				err(EXIT_FAILURE, "No online CPUs for this node");
+		}
+	} else {
+		int cpu_available = 0;
 
-	worker = calloc(nthreads, sizeof(*worker));
+		for (i = 0; i < ncpus && !cpu_available; i++) {
+			if (cpu_is_local(i))
+				cpu_available = 1;
+		}
+		if (!cpu_available)
+			err(EXIT_FAILURE, "No online CPUs for this node");
+	}
+
+	if (numa_node >= 0) {
+		ret = numa_run_on_node(numa_node);
+		if (ret < 0)
+			err(EXIT_FAILURE, "numa_run_on_node");
+		ret = asprintf(&node_str, " on node %d", numa_node);
+		if (ret < 0)
+			err(EXIT_FAILURE, "numa_node, asprintf");
+	}
+
+	worker = numa_alloc_local(nthreads * sizeof(*worker));
 	if (!worker)
 		goto errmem;
 
 	if (!fshared)
 		futex_flag = FUTEX_PRIVATE_FLAG;
 
-	printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
-	       getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
+	printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] futexes for %d secs.\n\n",
+	       getpid(), nthreads,
+	       node_str ? : "",
+	       nfutexes, fshared ? "shared":"private",
+	       nsecs);
 
 	init_stats(&throughput_stats);
 	pthread_mutex_init(&thread_lock, NULL);
@@ -157,14 +214,24 @@ int bench_futex_hash(int argc, const char **argv,
 	threads_starting = nthreads;
 	pthread_attr_init(&thread_attr);
 	gettimeofday(&start, NULL);
-	for (i = 0; i < nthreads; i++) {
+	for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
+
+		do {
+			if (cpu_is_local(cpunum))
+				break;
+			cpunum++;
+			if (cpunum > ncpus)
+				cpunum = 0;
+		} while (1);
+
 		worker[i].tid = i;
-		worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
+		worker[i].futex = numa_alloc_local(nfutexes *
+						   sizeof(*worker[i].futex));
 		if (!worker[i].futex)
 			goto errmem;
 
 		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_SET(cpunum % ncpus, &cpu);
 
 		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
 		if (ret)
@@ -211,12 +278,12 @@ int bench_futex_hash(int argc, const char **argv,
 				       &worker[i].futex[nfutexes-1], t);
 		}
 
-		free(worker[i].futex);
+		numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
 	}
 
 	print_summary();
 
-	free(worker);
+	numa_free(worker, nthreads * sizeof(*worker));
 	return ret;
 errmem:
 	err(EXIT_FAILURE, "calloc");
-- 
2.9.3


* Re: [PATCH 2/2] perf bench futex: add NUMA support
  2016-10-16 19:08 ` [PATCH 2/2] perf bench futex: add NUMA support Sebastian Andrzej Siewior
@ 2016-10-17 14:38   ` Arnaldo Carvalho de Melo
  2016-10-17 15:01     ` Jiri Olsa
  0 siblings, 1 reply; 16+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-10-17 14:38 UTC
  To: Jiri Olsa
  Cc: Sebastian Andrzej Siewior, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On Sun, Oct 16, 2016 at 09:08:03PM +0200, Sebastian Andrzej Siewior wrote:
> By default the application uses malloc() and all available CPUs. This
> patch introduces NUMA support which means:
> - memory is allocated node local via numa_alloc_local()
> - all CPUs of the specified NUMA node are used. This is also true if the
>   number of threads set is greater than the number of CPUs available on
>   this node.
> 
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> ---
>  tools/perf/bench/Build        |  4 ++
>  tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
>  2 files changed, 81 insertions(+), 10 deletions(-)
> 
> diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> index 60bf11943047..9e6e518d7d62 100644
> --- a/tools/perf/bench/Build
> +++ b/tools/perf/bench/Build
> @@ -1,3 +1,7 @@
> +ifdef CONFIG_NUMA
> +CFLAGS_futex-hash.o   += -DCONFIG_NUMA=1
> +endif

Jiri, do we really need this? I.e. aren't the CONFIG_FOO defines
available to tools?

>  perf-y += sched-messaging.o
>  perf-y += sched-pipe.o
>  perf-y += mem-functions.o
> diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
> index d9e5e80bb4d0..8db4a5bd6a4e 100644
> --- a/tools/perf/bench/futex-hash.c
> +++ b/tools/perf/bench/futex-hash.c
> @@ -25,6 +25,9 @@
>  
>  #include <err.h>
>  #include <sys/time.h>
> +#ifdef CONFIG_NUMA
> +#include <numa.h>
> +#endif
>  
>  static unsigned int nthreads = 0;
>  static unsigned int nsecs    = 10;
> @@ -32,6 +35,7 @@ static unsigned int nsecs    = 10;
>  static unsigned int nfutexes = 1024;
>  static bool fshared = false, done = false, silent = false;
>  static int futex_flag = 0;
> +static int numa_node = -1;
>  
>  struct timeval start, end, runtime;
>  static pthread_mutex_t thread_lock;
> @@ -55,9 +59,28 @@ static const struct option options[] = {
>  	OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
>  	OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"),
>  	OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of private ones"),
> +#ifdef CONFIG_NUMA
> +	OPT_INTEGER( 'n', "numa",   &numa_node,  "Specify the NUMA node"),
> +#endif
>  	OPT_END()
>  };
>  
> +#ifndef CONFIG_NUMA
> +static int numa_run_on_node(int node __maybe_unused) { return 0; }
> +static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
> +static void *numa_alloc_local(size_t size) { return malloc(size); }
> +static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
> +#endif
> +
> +static bool cpu_is_local(int cpu)
> +{
> +	if (numa_node < 0)
> +		return true;
> +	if (numa_node_of_cpu(cpu) == numa_node)
> +		return true;
> +	return false;
> +}
> +
>  static const char * const bench_futex_hash_usage[] = {
>  	"perf bench futex hash <options>",
>  	NULL
> @@ -123,6 +146,8 @@ int bench_futex_hash(int argc, const char **argv,
>  	unsigned int i, ncpus;
>  	pthread_attr_t thread_attr;
>  	struct worker *worker = NULL;
> +	char *node_str = NULL;
> +	unsigned int cpunum;
>  
>  	argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
>  	if (argc) {
> @@ -136,18 +161,50 @@ int bench_futex_hash(int argc, const char **argv,
>  	act.sa_sigaction = toggle_done;
>  	sigaction(SIGINT, &act, NULL);
>  
> -	if (!nthreads) /* default to the number of CPUs */
> -		nthreads = ncpus;
> +	if (!nthreads) {
> +		/* default to the number of CPUs per NUMA node */
> +		if (numa_node < 0) {
> +			nthreads = ncpus;
> +		} else {
> +			for (i = 0; i < ncpus; i++) {
> +				if (cpu_is_local(i))
> +					nthreads++;
> +			}
> +			if (!nthreads)
> +				err(EXIT_FAILURE, "No online CPUs for this node");
> +		}
> +	} else {
> +		int cpu_available = 0;
>  
> -	worker = calloc(nthreads, sizeof(*worker));
> +		for (i = 0; i < ncpus && !cpu_available; i++) {
> +			if (cpu_is_local(i))
> +				cpu_available = 1;
> +		}
> +		if (!cpu_available)
> +			err(EXIT_FAILURE, "No online CPUs for this node");
> +	}
> +
> +	if (numa_node >= 0) {
> +		ret = numa_run_on_node(numa_node);
> +		if (ret < 0)
> +			err(EXIT_FAILURE, "numa_run_on_node");
> +		ret = asprintf(&node_str, " on node %d", numa_node);
> +		if (ret < 0)
> +			err(EXIT_FAILURE, "numa_node, asprintf");
> +	}
> +
> +	worker = numa_alloc_local(nthreads * sizeof(*worker));
>  	if (!worker)
>  		goto errmem;
>  
>  	if (!fshared)
>  		futex_flag = FUTEX_PRIVATE_FLAG;
>  
> -	printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
> -	       getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
> +	printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] futexes for %d secs.\n\n",
> +	       getpid(), nthreads,
> +	       node_str ? : "",
> +	       nfutexes, fshared ? "shared":"private",
> +	       nsecs);
>  
>  	init_stats(&throughput_stats);
>  	pthread_mutex_init(&thread_lock, NULL);
> @@ -157,14 +214,24 @@ int bench_futex_hash(int argc, const char **argv,
>  	threads_starting = nthreads;
>  	pthread_attr_init(&thread_attr);
>  	gettimeofday(&start, NULL);
> -	for (i = 0; i < nthreads; i++) {
> +	for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
> +
> +		do {
> +			if (cpu_is_local(cpunum))
> +				break;
> +			cpunum++;
> +			if (cpunum > ncpus)
> +				cpunum = 0;
> +		} while (1);
> +
>  		worker[i].tid = i;
> -		worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
> +		worker[i].futex = numa_alloc_local(nfutexes *
> +						   sizeof(*worker[i].futex));
>  		if (!worker[i].futex)
>  			goto errmem;
>  
>  		CPU_ZERO(&cpu);
> -		CPU_SET(i % ncpus, &cpu);
> +		CPU_SET(cpunum % ncpus, &cpu);
>  
>  		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
>  		if (ret)
> @@ -211,12 +278,12 @@ int bench_futex_hash(int argc, const char **argv,
>  				       &worker[i].futex[nfutexes-1], t);
>  		}
>  
> -		free(worker[i].futex);
> +		numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
>  	}
>  
>  	print_summary();
>  
> -	free(worker);
> +	numa_free(worker, nthreads * sizeof(*worker));
>  	return ret;
>  errmem:
>  	err(EXIT_FAILURE, "calloc");
> -- 
> 2.9.3


* Re: [PATCH 2/2] perf bench futex: add NUMA support
  2016-10-17 14:38   ` Arnaldo Carvalho de Melo
@ 2016-10-17 15:01     ` Jiri Olsa
  2016-10-17 15:04       ` Arnaldo Carvalho de Melo
  0 siblings, 1 reply; 16+ messages in thread
From: Jiri Olsa @ 2016-10-17 15:01 UTC
  To: Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, Sebastian Andrzej Siewior, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On Mon, Oct 17, 2016 at 11:38:21AM -0300, Arnaldo Carvalho de Melo wrote:
> On Sun, Oct 16, 2016 at 09:08:03PM +0200, Sebastian Andrzej Siewior wrote:
> > By default the application uses malloc() and all available CPUs. This
> > patch introduces NUMA support which means:
> > - memory is allocated node local via numa_alloc_local()
> > - all CPUs of the specified NUMA node are used. This is also true if the
> >   number of threads set is greater than the number of CPUs available on
> >   this node.
> > 
> > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > ---
> >  tools/perf/bench/Build        |  4 ++
> >  tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
> >  2 files changed, 81 insertions(+), 10 deletions(-)
> > 
> > diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> > index 60bf11943047..9e6e518d7d62 100644
> > --- a/tools/perf/bench/Build
> > +++ b/tools/perf/bench/Build
> > @@ -1,3 +1,7 @@
> > +ifdef CONFIG_NUMA
> > +CFLAGS_futex-hash.o   += -DCONFIG_NUMA=1
> > +endif
> 
> Jiri, do we really need this? I.e. aren't the CONFIG_FOO defines
> available to tools?

not directly ATM.. it's prepared for the .config custom setting feature

meanwhile we set HAVE_* defines, like for CONFIG_NUMA we have:
 -DHAVE_LIBNUMA_SUPPORT

you can check it in Makefile.config

jirka


* Re: [PATCH 2/2] perf bench futex: add NUMA support
  2016-10-17 15:01     ` Jiri Olsa
@ 2016-10-17 15:04       ` Arnaldo Carvalho de Melo
  2016-10-17 15:33         ` [PATCH 2/2 v2] " Sebastian Andrzej Siewior
  0 siblings, 1 reply; 16+ messages in thread
From: Arnaldo Carvalho de Melo @ 2016-10-17 15:04 UTC
  To: Jiri Olsa
  Cc: Jiri Olsa, Sebastian Andrzej Siewior, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On Mon, Oct 17, 2016 at 05:01:23PM +0200, Jiri Olsa wrote:
> On Mon, Oct 17, 2016 at 11:38:21AM -0300, Arnaldo Carvalho de Melo wrote:
> > On Sun, Oct 16, 2016 at 09:08:03PM +0200, Sebastian Andrzej Siewior wrote:
> > > By default the application uses malloc() and all available CPUs. This
> > > patch introduces NUMA support which means:
> > > - memory is allocated node local via numa_alloc_local()
> > > - all CPUs of the specified NUMA node are used. This is also true if the
> > >   number of threads set is greater than the number of CPUs available on
> > >   this node.
> > > 
> > > Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
> > > ---
> > >  tools/perf/bench/Build        |  4 ++
> > >  tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
> > >  2 files changed, 81 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/tools/perf/bench/Build b/tools/perf/bench/Build
> > > index 60bf11943047..9e6e518d7d62 100644
> > > --- a/tools/perf/bench/Build
> > > +++ b/tools/perf/bench/Build
> > > @@ -1,3 +1,7 @@
> > > +ifdef CONFIG_NUMA
> > > +CFLAGS_futex-hash.o   += -DCONFIG_NUMA=1
> > > +endif
> > 
> > Jiri, do we really need this? I.e. aren't the CONFIG_FOO defines
> > available to tools?
> 
> not directly ATM.. it's prepared for the .config customary setting feature
> 
> meanwhile we set HAVE_* defines, like for CONFIG_NUMA we have:
>  -DHAVE_LIBNUMA_SUPPORT
> 
> you can check it in Makefile.config

So, Andrzej, can you use that instead? I merged the first patch already.

- Arnaldo


* [PATCH 2/2 v2] perf bench futex: add NUMA support
  2016-10-17 15:04       ` Arnaldo Carvalho de Melo
@ 2016-10-17 15:33         ` Sebastian Andrzej Siewior
  2016-10-19 18:16           ` Davidlohr Bueso
  2016-10-21  3:03           ` Davidlohr Bueso
  0 siblings, 2 replies; 16+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-10-17 15:33 UTC
  To: Arnaldo Carvalho de Melo
  Cc: Jiri Olsa, Jiri Olsa, Peter Zijlstra, Ingo Molnar, linux-kernel,
	Davidlohr Bueso

By default the application uses malloc() and all available CPUs. This
patch introduces NUMA support which means:
- memory is allocated node local via numa_alloc_local()
- all CPUs of the specified NUMA node are used. This is also true if the
  number of threads set is greater than the number of CPUs available on
  this node.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
v1…v2: use HAVE_LIBNUMA_SUPPORT instead of the custom variant

 tools/perf/bench/futex-hash.c | 87 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 77 insertions(+), 10 deletions(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index d9e5e80bb4d0..17d38ce829f8 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -25,6 +25,9 @@
 
 #include <err.h>
 #include <sys/time.h>
+#ifdef HAVE_LIBNUMA_SUPPORT
+#include <numa.h>
+#endif
 
 static unsigned int nthreads = 0;
 static unsigned int nsecs    = 10;
@@ -32,6 +35,7 @@ static unsigned int nsecs    = 10;
 static unsigned int nfutexes = 1024;
 static bool fshared = false, done = false, silent = false;
 static int futex_flag = 0;
+static int numa_node = -1;
 
 struct timeval start, end, runtime;
 static pthread_mutex_t thread_lock;
@@ -55,9 +59,28 @@ static const struct option options[] = {
 	OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
 	OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"),
 	OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of private ones"),
+#ifdef HAVE_LIBNUMA_SUPPORT
+	OPT_INTEGER( 'n', "numa",   &numa_node,  "Specify the NUMA node"),
+#endif
 	OPT_END()
 };
 
+#ifndef HAVE_LIBNUMA_SUPPORT
+static int numa_run_on_node(int node __maybe_unused) { return 0; }
+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
+static void *numa_alloc_local(size_t size) { return malloc(size); }
+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
+#endif
+
+static bool cpu_is_local(int cpu)
+{
+	if (numa_node < 0)
+		return true;
+	if (numa_node_of_cpu(cpu) == numa_node)
+		return true;
+	return false;
+}
+
 static const char * const bench_futex_hash_usage[] = {
 	"perf bench futex hash <options>",
 	NULL
@@ -123,6 +146,8 @@ int bench_futex_hash(int argc, const char **argv,
 	unsigned int i, ncpus;
 	pthread_attr_t thread_attr;
 	struct worker *worker = NULL;
+	char *node_str = NULL;
+	unsigned int cpunum;
 
 	argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
 	if (argc) {
@@ -136,18 +161,50 @@ int bench_futex_hash(int argc, const char **argv,
 	act.sa_sigaction = toggle_done;
 	sigaction(SIGINT, &act, NULL);
 
-	if (!nthreads) /* default to the number of CPUs */
-		nthreads = ncpus;
+	if (!nthreads) {
+		/* default to the number of CPUs per NUMA node */
+		if (numa_node < 0) {
+			nthreads = ncpus;
+		} else {
+			for (i = 0; i < ncpus; i++) {
+				if (cpu_is_local(i))
+					nthreads++;
+			}
+			if (!nthreads)
+				err(EXIT_FAILURE, "No online CPUs for this node");
+		}
+	} else {
+		int cpu_available = 0;
 
-	worker = calloc(nthreads, sizeof(*worker));
+		for (i = 0; i < ncpus && !cpu_available; i++) {
+			if (cpu_is_local(i))
+				cpu_available = 1;
+		}
+		if (!cpu_available)
+			err(EXIT_FAILURE, "No online CPUs for this node");
+	}
+
+	if (numa_node >= 0) {
+		ret = numa_run_on_node(numa_node);
+		if (ret < 0)
+			err(EXIT_FAILURE, "numa_run_on_node");
+		ret = asprintf(&node_str, " on node %d", numa_node);
+		if (ret < 0)
+			err(EXIT_FAILURE, "numa_node, asprintf");
+	}
+
+	worker = numa_alloc_local(nthreads * sizeof(*worker));
 	if (!worker)
 		goto errmem;
 
 	if (!fshared)
 		futex_flag = FUTEX_PRIVATE_FLAG;
 
-	printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
-	       getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
+	printf("Run summary [PID %d]: %d threads%s, each operating on %d [%s] futexes for %d secs.\n\n",
+	       getpid(), nthreads,
+	       node_str ? : "",
+	       nfutexes, fshared ? "shared":"private",
+	       nsecs);
 
 	init_stats(&throughput_stats);
 	pthread_mutex_init(&thread_lock, NULL);
@@ -157,14 +214,24 @@ int bench_futex_hash(int argc, const char **argv,
 	threads_starting = nthreads;
 	pthread_attr_init(&thread_attr);
 	gettimeofday(&start, NULL);
-	for (i = 0; i < nthreads; i++) {
+	for (cpunum = 0, i = 0; i < nthreads; i++, cpunum++) {
+
+		do {
+			if (cpu_is_local(cpunum))
+				break;
+			cpunum++;
+			if (cpunum > ncpus)
+				cpunum = 0;
+		} while (1);
+
 		worker[i].tid = i;
-		worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
+		worker[i].futex = numa_alloc_local(nfutexes *
+						   sizeof(*worker[i].futex));
 		if (!worker[i].futex)
 			goto errmem;
 
 		CPU_ZERO(&cpu);
-		CPU_SET(i % ncpus, &cpu);
+		CPU_SET(cpunum % ncpus, &cpu);
 
 		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
 		if (ret)
@@ -211,12 +278,12 @@ int bench_futex_hash(int argc, const char **argv,
 				       &worker[i].futex[nfutexes-1], t);
 		}
 
-		free(worker[i].futex);
+		numa_free(worker[i].futex, nfutexes * sizeof(*worker[i].futex));
 	}
 
 	print_summary();
 
-	free(worker);
+	numa_free(worker, nthreads * sizeof(*worker));
 	return ret;
 errmem:
 	err(EXIT_FAILURE, "calloc");
-- 
2.9.3


* Re: [PATCH 1/2] perf bench futex: cache align the worker struct
  2016-10-16 19:08 [PATCH 1/2] perf bench futex: cache align the worker struct Sebastian Andrzej Siewior
  2016-10-16 19:08 ` [PATCH 2/2] perf bench futex: add NUMA support Sebastian Andrzej Siewior
@ 2016-10-18  1:09 ` Davidlohr Bueso
  2016-10-19 13:07   ` Sebastian Andrzej Siewior
  2016-10-24 19:06 ` [tip:perf/core] perf bench futex: Cache align the worker struct tip-bot for Sebastian Andrzej Siewior
  2 siblings, 1 reply; 16+ messages in thread
From: Davidlohr Bueso @ 2016-10-18  1:09 UTC
  To: Sebastian Andrzej Siewior
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On Sun, 16 Oct 2016, Sebastian Andrzej Siewior wrote:

>It popped up in perf testing that the worker consumes some amount of
>CPU. It boils down to the increment of `ops` which causes cache line
>bouncing between the individual threads.

Are you referring to this?

       │         for (i = 0; i < nfutexes; i++, w->ops++) {
       │ be:   add    $0x1,%ebx
 65.87 │       addq   $0x1,0x18(%r12)

(which is like 65% of 13% on my box with a default futex-hash run).

Even better, could we get rid entirely of the ops increments and just
use a local variable, then update the worker at the end of the function.
The following makes 'perf' pretty much disappear in the profile.

Thanks,
Davidlohr

----8<--------------
From: Davidlohr Bueso <dave@stgolabs.net>
Subject: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing

Sebastian noted that overhead for worker thread ops (throughput)
accounting was causing 'perf' to appear in the profiles, consuming
a non-trivial (i.e. 13%) amount of CPU. This is due to cache-line
bouncing caused by the increment of w->ops. We can easily fix this by
just working on a local copy and updating the actual worker once
done running, and ready to show the program summary. There is no
danger of the worker being concurrent, so we can trust that no stale
value is being seen by another thread.

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
 tools/perf/bench/futex-hash.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5febd2..0b9583164da0 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -63,8 +63,10 @@ static const char * const bench_futex_hash_usage[] = {
 static void *workerfn(void *arg)
 {
 	int ret;
-	unsigned int i;
 	struct worker *w = (struct worker *) arg;
+	unsigned int i;
+	unsigned long ops = w->ops; /* avoid cacheline-bouncing */
+
 
 	pthread_mutex_lock(&thread_lock);
 	threads_starting--;
@@ -74,7 +76,7 @@ static void *workerfn(void *arg)
 	pthread_mutex_unlock(&thread_lock);
 
 	do {
-		for (i = 0; i < nfutexes; i++, w->ops++) {
+		for (i = 0; i < nfutexes; i++, ops++) {
 			/*
 			 * We want the futex calls to fail in order to stress
 			 * the hashing of uaddr and not measure other steps,
@@ -88,6 +90,8 @@ static void *workerfn(void *arg)
 		}
 	}  while (!done);
 
+	w->ops = ops;
+
 	return NULL;
 }
 
-- 
2.6.6


* Re: [PATCH 1/2] perf bench futex: cache align the worker struct
  2016-10-18  1:09 ` [PATCH 1/2] perf bench futex: cache align the worker struct Davidlohr Bueso
@ 2016-10-19 13:07   ` Sebastian Andrzej Siewior
  2016-10-19 17:59     ` [PATCH] perf/bench-futex: Avoid worker cacheline bouncing Davidlohr Bueso
  0 siblings, 1 reply; 16+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-10-19 13:07 UTC
  To: Davidlohr Bueso
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On 2016-10-17 18:09:49 [-0700], Davidlohr Bueso wrote:
> On Sun, 16 Oct 2016, Sebastian Andrzej Siewior wrote:
> 
> > It popped up in perf testing that the worker consumes some amount of
> > CPU. It boils down to the increment of `ops` which causes cache line
> > bouncing between the individual threads.
> 
> Are you referring to this?
> 
>       │         for (i = 0; i < nfutexes; i++, w->ops++) {
>       │ be:   add    $0x1,%ebx
> 65.87 │       addq   $0x1,0x18(%r12)
> 
> (which is like 65% of 13% on my box with a default futex-hash run).
correct.

> Even better, could we get rid entirely of the ops increments and just
> use a local variable, then update the worker at the end of the function.
> The following makes 'perf' pretty much disappear in the profile.

This should do it, too. What remains is the read access to w->futex,
but since it does not pop up in perf, it is probably not that important.

> Thanks,
> Davidlohr

Sebastian


* [PATCH] perf/bench-futex: Avoid worker cacheline bouncing
  2016-10-19 13:07   ` Sebastian Andrzej Siewior
@ 2016-10-19 17:59     ` Davidlohr Bueso
  2016-10-19 18:13       ` Sebastian Andrzej Siewior
  0 siblings, 1 reply; 16+ messages in thread
From: Davidlohr Bueso @ 2016-10-19 17:59 UTC
  To: Sebastian Andrzej Siewior
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

Sebastian noted that overhead for worker thread ops (throughput)
accounting was causing 'perf' to appear in the profiles, consuming
a non-trivial (i.e. 13%) amount of CPU. This is due to cache-line
bouncing caused by the increment of w->ops. We can easily fix this by
just working on a local copy and updating the actual worker once
done running, and ready to show the program summary. There is no
danger of the worker being concurrent, so we can trust that no stale
value is being seen by another thread.

Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Davidlohr Bueso <dbueso@suse.de>
---
This is an updated version, as I noticed lock-pi was also doing this.
acme, could you please pick this up if there's no objection? Thanks.

 tools/perf/bench/futex-hash.c    | 6 ++++--
 tools/perf/bench/futex-lock-pi.c | 4 +++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5febd2..da04b8c5568a 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -63,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = {
 static void *workerfn(void *arg)
 {
 	int ret;
-	unsigned int i;
 	struct worker *w = (struct worker *) arg;
+	unsigned int i;
+	unsigned long ops = w->ops; /* avoid cacheline bouncing */
 
 	pthread_mutex_lock(&thread_lock);
 	threads_starting--;
@@ -74,7 +75,7 @@ static void *workerfn(void *arg)
 	pthread_mutex_unlock(&thread_lock);
 
 	do {
-		for (i = 0; i < nfutexes; i++, w->ops++) {
+		for (i = 0; i < nfutexes; i++, ops++) {
 			/*
 			 * We want the futex calls to fail in order to stress
 			 * the hashing of uaddr and not measure other steps,
@@ -88,6 +89,7 @@ static void *workerfn(void *arg)
 		}
 	}  while (!done);
 
+	w->ops = ops;
 	return NULL;
 }
 
diff --git a/tools/perf/bench/futex-lock-pi.c b/tools/perf/bench/futex-lock-pi.c
index 936d89d30483..7032e4643c65 100644
--- a/tools/perf/bench/futex-lock-pi.c
+++ b/tools/perf/bench/futex-lock-pi.c
@@ -75,6 +75,7 @@ static void toggle_done(int sig __maybe_unused,
 static void *workerfn(void *arg)
 {
 	struct worker *w = (struct worker *) arg;
+	unsigned long ops = w->ops;
 
 	pthread_mutex_lock(&thread_lock);
 	threads_starting--;
@@ -103,9 +104,10 @@ static void *workerfn(void *arg)
 		if (ret && !silent)
 			warn("thread %d: Could not unlock pi-lock for %p (%d)",
 			     w->tid, w->futex, ret);
-		w->ops++; /* account for thread's share of work */
+		ops++; /* account for thread's share of work */
 	}  while (!done);
 
+	w->ops = ops;
 	return NULL;
 }
 
-- 
2.6.6


* Re: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing
  2016-10-19 17:59     ` [PATCH] perf/bench-futex: Avoid worker cacheline bouncing Davidlohr Bueso
@ 2016-10-19 18:13       ` Sebastian Andrzej Siewior
  2016-10-19 18:41         ` Davidlohr Bueso
  0 siblings, 1 reply; 16+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-10-19 18:13 UTC
  To: Davidlohr Bueso
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On 2016-10-19 10:59:33 [-0700], Davidlohr Bueso wrote:
> Sebastian noted that overhead for worker thread ops (throughput)
> accounting was causing 'perf' to appear in the profiles, consuming
> a non-trivial (i.e. 13%) amount of CPU. This is due to cache-line
> bouncing caused by the increment of w->ops. We can easily fix this by
> just working on a local copy and updating the actual worker once
> done running, and ready to show the program summary. There is no
> danger of the worker being concurrent, so we can trust that no stale
> value is being seen by another thread.
> 
> Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

> --- a/tools/perf/bench/futex-hash.c
> +++ b/tools/perf/bench/futex-hash.c
> @@ -63,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = {
> static void *workerfn(void *arg)
> {
> 	int ret;
> -	unsigned int i;
> 	struct worker *w = (struct worker *) arg;
> +	unsigned int i;
> +	unsigned long ops = w->ops; /* avoid cacheline bouncing */

we start at 0 so there is probably no need to init it with w->ops.

Sebastian


* Re: [PATCH 2/2 v2] perf bench futex: add NUMA support
  2016-10-17 15:33         ` [PATCH 2/2 v2] " Sebastian Andrzej Siewior
@ 2016-10-19 18:16           ` Davidlohr Bueso
  2016-10-19 18:37             ` Sebastian Andrzej Siewior
  2016-10-21  3:03           ` Davidlohr Bueso
  1 sibling, 1 reply; 16+ messages in thread
From: Davidlohr Bueso @ 2016-10-19 18:16 UTC
  To: Sebastian Andrzej Siewior
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Jiri Olsa, Peter Zijlstra,
	Ingo Molnar, linux-kernel, Davidlohr Bueso

On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:

>By default the application uses malloc() and all available CPUs. This
>patch introduces NUMA support which means:
>- memory is allocated node local via numa_alloc_local()
>- all CPUs of the specified NUMA node are used. This is also true if the
>  number of threads set is greater than the number of CPUs available on
>  this node.

Can't we just use numactl to bind cpus and memory to be node-local?

Thanks,
Davidlohr


* Re: [PATCH 2/2 v2] perf bench futex: add NUMA support
  2016-10-19 18:16           ` Davidlohr Bueso
@ 2016-10-19 18:37             ` Sebastian Andrzej Siewior
  2016-10-21  2:34               ` Davidlohr Bueso
  0 siblings, 1 reply; 16+ messages in thread
From: Sebastian Andrzej Siewior @ 2016-10-19 18:37 UTC
  To: Davidlohr Bueso
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Jiri Olsa, Peter Zijlstra,
	Ingo Molnar, linux-kernel, Davidlohr Bueso

On 2016-10-19 11:16:16 [-0700], Davidlohr Bueso wrote:
> On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:
> 
> > By default the application uses malloc() and all available CPUs. This
> > patch introduces NUMA support which means:
> > - memory is allocated node local via numa_alloc_local()
> > - all CPUs of the specified NUMA node are used. This is also true if the
> >  number of threads set is greater than the number of CPUs available on
> >  this node.
> 
> Can't we just use numactl to bind cpus and memory to be node-local?

something like
	numactl --cpunodebind=$NODE --membind=$NODE perf …
?
This should work for memory. However, since we use
	pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
we would need to query the affinity mask and deploy threads based on
that mask.
Using NUMA support within this bench tool also has the side effect that
the output lists all the options used.
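
A rough sketch of how querying the mask could look (illustrative only, not
from any posted patch; the helper names are made up). Assuming the benchmark
was started under numactl, read the inherited mask once via
sched_getaffinity() and pin worker i to the i-th allowed CPU:

#define _GNU_SOURCE
#include <sched.h>
#include <pthread.h>

/* return the n-th set CPU in the mask, n taken modulo the number of set CPUs */
static int nth_allowed_cpu(const cpu_set_t *allowed, int ncpus, int n)
{
	int count = CPU_COUNT(allowed);
	int cpu;

	if (!count)
		return -1;
	n %= count;
	for (cpu = 0; cpu < ncpus; cpu++)
		if (CPU_ISSET(cpu, allowed) && n-- == 0)
			return cpu;
	return -1;
}

static int pin_worker(pthread_attr_t *attr, int i, int ncpus)
{
	cpu_set_t allowed, cpu;
	int c;

	/* mask inherited e.g. from numactl --cpunodebind=... */
	if (sched_getaffinity(0, sizeof(allowed), &allowed))
		return -1;
	c = nth_allowed_cpu(&allowed, ncpus, i);
	if (c < 0)
		return -1;
	CPU_ZERO(&cpu);
	CPU_SET(c, &cpu);
	return pthread_attr_setaffinity_np(attr, sizeof(cpu_set_t), &cpu);
}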

> Thanks,
> Davidlohr

Sebastian


* Re: [PATCH] perf/bench-futex: Avoid worker cacheline bouncing
  2016-10-19 18:13       ` Sebastian Andrzej Siewior
@ 2016-10-19 18:41         ` Davidlohr Bueso
  0 siblings, 0 replies; 16+ messages in thread
From: Davidlohr Bueso @ 2016-10-19 18:41 UTC
  To: Sebastian Andrzej Siewior
  Cc: Arnaldo Carvalho de Melo, Peter Zijlstra, Ingo Molnar,
	linux-kernel, Davidlohr Bueso

On Wed, 19 Oct 2016, Sebastian Andrzej Siewior wrote:

>On 2016-10-19 10:59:33 [-0700], Davidlohr Bueso wrote:
>> Sebastian noted that overhead for worker thread ops (throughput)
>> accounting was causing 'perf' to appear in the profiles, consuming
>> a non-trivial (i.e. 13%) amount of CPU. This is due to cache-line
>> bouncing caused by the increment of w->ops. We can easily fix this by
>> just working on a local copy and updating the actual worker once
>> done running, and ready to show the program summary. There is no
>> danger of the worker being concurrent, so we can trust that no stale
>> value is being seen by another thread.
>>
>> Reported-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
>Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Thanks.

>
>> --- a/tools/perf/bench/futex-hash.c
>> +++ b/tools/perf/bench/futex-hash.c
>> @@ -63,8 +63,9 @@ static const char * const bench_futex_hash_usage[] = {
>> static void *workerfn(void *arg)
>> {
>> 	int ret;
>> -	unsigned int i;
>> 	struct worker *w = (struct worker *) arg;
>> +	unsigned int i;
>> +	unsigned long ops = w->ops; /* avoid cacheline bouncing */
>
>we start at 0 so there is probably no need to init it with w->ops.

Yeah, but I prefer having it this way: it separates the init from the actual
work (although it's no big deal here). The extra load happens ncpu times, so
it's also no big deal.


* Re: [PATCH 2/2 v2] perf bench futex: add NUMA support
  2016-10-19 18:37             ` Sebastian Andrzej Siewior
@ 2016-10-21  2:34               ` Davidlohr Bueso
  0 siblings, 0 replies; 16+ messages in thread
From: Davidlohr Bueso @ 2016-10-21  2:34 UTC
  To: Sebastian Andrzej Siewior
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Jiri Olsa, Peter Zijlstra,
	Ingo Molnar, linux-kernel, Davidlohr Bueso

On Wed, 19 Oct 2016, Sebastian Andrzej Siewior wrote:

>On 2016-10-19 11:16:16 [-0700], Davidlohr Bueso wrote:
>> On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:
>>
>> > By default the application uses malloc() and all available CPUs. This
>> > patch introduces NUMA support which means:
>> > - memory is allocated node local via numa_alloc_local()
>> > - all CPUs of the specified NUMA node are used. This is also true if the
>> >  number of threads set is greater than the number of CPUs available on
>> >  this node.
>>
>> Can't we just use numactl to bind cpus and memory to be node-local?
>
>something like
>	numactl --cpunodebind=$NODE --membind=$NODE perf …
>?

Yes.

>This should work for memory however since we use
>	pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
>we would need to query the affinity mask, and deploy threads based on
>that mask.

Ah right. I also considered getting rid of the affinity, but that would
probably hurt more than help (or at least alter results) for the non-NUMA case.

>Using NUMA support within this bench-tool has also the side effect that
>the output gives all the option used.

So if we are going to support the numa option for the benchmark, could you please
move the new code into futex.h instead of futex-hash.c? That way we can integrate
the support for the other futex programs as well.
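
For illustration, a hypothetical sketch of what the shared bits in
tools/perf/bench/futex.h could look like (names and placement are guesses on
the suggestion above, not the actual follow-up change):

#include <stdbool.h>
#include <stdlib.h>

#ifndef __maybe_unused
#define __maybe_unused __attribute__((unused))
#endif

#ifdef HAVE_LIBNUMA_SUPPORT
#include <numa.h>
#else
/* fallbacks for builds without libnuma, as in the patch above */
static inline int numa_run_on_node(int node __maybe_unused) { return 0; }
static inline int numa_node_of_cpu(int cpu __maybe_unused) { return 0; }
static inline void *numa_alloc_local(size_t size) { return malloc(size); }
static inline void numa_free(void *p, size_t size __maybe_unused) { free(p); }
#endif

/* node-selection helper that every futex bench could share */
static inline bool futex_bench_cpu_is_local(int cpu, int numa_node)
{
	return numa_node < 0 || numa_node_of_cpu(cpu) == numa_node;
}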

Thanks,
Davidlohr


* Re: [PATCH 2/2 v2] perf bench futex: add NUMA support
  2016-10-17 15:33         ` [PATCH 2/2 v2] " Sebastian Andrzej Siewior
  2016-10-19 18:16           ` Davidlohr Bueso
@ 2016-10-21  3:03           ` Davidlohr Bueso
  1 sibling, 0 replies; 16+ messages in thread
From: Davidlohr Bueso @ 2016-10-21  3:03 UTC
  To: Sebastian Andrzej Siewior
  Cc: Arnaldo Carvalho de Melo, Jiri Olsa, Jiri Olsa, Peter Zijlstra,
	Ingo Molnar, linux-kernel, Davidlohr Bueso

On Mon, 17 Oct 2016, Sebastian Andrzej Siewior wrote:

>+#ifdef HAVE_LIBNUMA_SUPPORT
>+#include <numa.h>
>+#endif

In futex.h

>+static int numa_node = -1;

In futex.h (perhaps rename to futexbench_numa_node?)

>+#ifndef HAVE_LIBNUMA_SUPPORT
>+static int numa_run_on_node(int node __maybe_unused) { return 0; }
>+static int numa_node_of_cpu(int node __maybe_unused) { return 0; }
>+static void *numa_alloc_local(size_t size) { return malloc(size); }
>+static void numa_free(void *p, size_t size __maybe_unused) { return free(p); }
>+#endif
>+
>+static bool cpu_is_local(int cpu)
>+{
>+	if (numa_node < 0)
>+		return true;
>+	if (numa_node_of_cpu(cpu) == numa_node)
>+		return true;
>+	return false;
>+}

In futex.h

>-	if (!nthreads) /* default to the number of CPUs */
>-		nthreads = ncpus;
>+	if (!nthreads) {
>+		/* default to the number of CPUs per NUMA node */

This comment should go...

>+		if (numa_node < 0) {
>+			nthreads = ncpus;
>+		} else {

here.

>+			for (i = 0; i < ncpus; i++) {
>+				if (cpu_is_local(i))
>+					nthreads++;
>+			}
>+			if (!nthreads)
>+				err(EXIT_FAILURE, "No online CPUs for this node");
>+		}
>+	} else {
>+		int cpu_available = 0;
>
>-	worker = calloc(nthreads, sizeof(*worker));
>+		for (i = 0; i < ncpus && !cpu_available; i++) {
>+			if (cpu_is_local(i))
>+				cpu_available = 1;
>+		}

Is this really necessary? If the user passes the number of threads, then we shouldn't
care about ncpus; we just run all the threads on the specified node. Wouldn't the
below numa_run_on_node() ensure that the node is not, for example, CPU-less?

>+		if (!cpu_available)
>+			err(EXIT_FAILURE, "No online CPUs for this node");
>+	}
>+
>+	if (numa_node >= 0) {
>+		ret = numa_run_on_node(numa_node);
>+		if (ret < 0)
>+			err(EXIT_FAILURE, "numa_run_on_node");


Thanks,
Davidlohr


* [tip:perf/core] perf bench futex: Cache align the worker struct
  2016-10-16 19:08 [PATCH 1/2] perf bench futex: cache align the worker struct Sebastian Andrzej Siewior
  2016-10-16 19:08 ` [PATCH 2/2] perf bench futex: add NUMA support Sebastian Andrzej Siewior
  2016-10-18  1:09 ` [PATCH 1/2] perf bench futex: cache align the worker struct Davidlohr Bueso
@ 2016-10-24 19:06 ` tip-bot for Sebastian Andrzej Siewior
  2 siblings, 0 replies; 16+ messages in thread
From: tip-bot for Sebastian Andrzej Siewior @ 2016-10-24 19:06 UTC
  To: linux-tip-commits
  Cc: acme, peterz, tglx, dbueso, hpa, mingo, bigeasy, linux-kernel

Commit-ID:  34b753007d646482a4125a7095e1d1986d395f95
Gitweb:     http://git.kernel.org/tip/34b753007d646482a4125a7095e1d1986d395f95
Author:     Sebastian Andrzej Siewior <bigeasy@linutronix.de>
AuthorDate: Sun, 16 Oct 2016 21:08:02 +0200
Committer:  Arnaldo Carvalho de Melo <acme@redhat.com>
CommitDate: Mon, 24 Oct 2016 11:07:45 -0300

perf bench futex: Cache align the worker struct

It popped up in perf testing that the worker consumes some amount of
CPU. It boils down to the increment of `ops` which causes cache line
bouncing between the individual threads.

This patch aligns the struct to 256 bytes to ensure that no cache
line is shared among CPUs. 128 bytes is the x86 worst case, and grep shows
that L1_CACHE_SHIFT is set to 8 (i.e. 256 bytes) on s390.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Davidlohr Bueso <dbueso@suse.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20161016190803.3392-1-bigeasy@linutronix.de
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
---
 tools/perf/bench/futex-hash.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
index 8024cd5..d9e5e80 100644
--- a/tools/perf/bench/futex-hash.c
+++ b/tools/perf/bench/futex-hash.c
@@ -39,12 +39,15 @@ static unsigned int threads_starting;
 static struct stats throughput_stats;
 static pthread_cond_t thread_parent, thread_worker;
 
+#define SMP_CACHE_BYTES 256
+#define __cacheline_aligned __attribute__ ((aligned (SMP_CACHE_BYTES)))
+
 struct worker {
 	int tid;
 	u_int32_t *futex;
 	pthread_t thread;
 	unsigned long ops;
-};
+} __cacheline_aligned;
 
 static const struct option options[] = {
 	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),


Thread overview: 16+ messages
2016-10-16 19:08 [PATCH 1/2] perf bench futex: cache align the worker struct Sebastian Andrzej Siewior
2016-10-16 19:08 ` [PATCH 2/2] perf bench futex: add NUMA support Sebastian Andrzej Siewior
2016-10-17 14:38   ` Arnaldo Carvalho de Melo
2016-10-17 15:01     ` Jiri Olsa
2016-10-17 15:04       ` Arnaldo Carvalho de Melo
2016-10-17 15:33         ` [PATCH 2/2 v2] " Sebastian Andrzej Siewior
2016-10-19 18:16           ` Davidlohr Bueso
2016-10-19 18:37             ` Sebastian Andrzej Siewior
2016-10-21  2:34               ` Davidlohr Bueso
2016-10-21  3:03           ` Davidlohr Bueso
2016-10-18  1:09 ` [PATCH 1/2] perf bench futex: cache align the worker struct Davidlohr Bueso
2016-10-19 13:07   ` Sebastian Andrzej Siewior
2016-10-19 17:59     ` [PATCH] perf/bench-futex: Avoid worker cacheline bouncing Davidlohr Bueso
2016-10-19 18:13       ` Sebastian Andrzej Siewior
2016-10-19 18:41         ` Davidlohr Bueso
2016-10-24 19:06 ` [tip:perf/core] perf bench futex: Cache align the worker struct tip-bot for Sebastian Andrzej Siewior
