Re: [PATCH] kernel-shark: Multi-thread the computaion of CPU graph

Yordan Karadzhov <y.karadz@xxxxxxxxx> · Sat, 28 Sep 2024 17:32:37 +0300

Hi Libo,


I like the idea of the patch, however a bit of extra work is needed 

before merging it. See my comments in the code below.


On 9/23/24 12:09, Libo Chen wrote:

Parallelize _newCPUGraph() calls to dramatically speed up
graph rendering particularly for traces from very large systems.

OpenMP technically is a new dependency here, but it's so embedded
with GCC toolchains, as long as your GCC is not older than v4.9,
the libgomp library that comes with it will work.

Signed-off-by: Libo Chen <libo.chen@xxxxxxxxxx>
---
  CMakeLists.txt     |  5 +++++
  src/KsGLWidget.cpp | 15 ++++++++++++++-
  2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c36d757..8dd9ff9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -84,6 +84,11 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin")
  set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common")

  

+find_package(OpenMP 3.2.5)

+if (OPENMP_FOUND)
+    set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   ${OpenMP_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")


The "if" must be closed.


+
  set(CMAKE_CXX_STANDARD 17)
  set(CMAKE_CXX_STANDARD_REQUIRED ON)
  set(CMAKE_CXX_EXTENSIONS OFF)
diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp
index 9311d98..00a1951 100644
--- a/src/KsGLWidget.cpp
+++ b/src/KsGLWidget.cpp
@@ -13,6 +13,9 @@
  #include <GL/glut.h>
  #include <GL/gl.h>

  

+// OpenMP

+#include <omp.h>
+
  // KernelShark
  #include "libkshark-plugin.h"
  #include "KsGLWidget.hpp"
@@ -690,10 +693,20 @@ void KsGLWidget::_makeGraphs()

  

  	for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) {

  		sd = it.key();
+		QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count());
+		QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count());
+
  		/* Create CPU graphs according to the cpuList. */
  		it.value()._cpuGraphs = {};
+		omp_set_num_threads(omp_get_num_procs());



Not sure how setting the thread number works in opm, but I would guess 

that you have to do it just once. If this is the case, please move the 

the line above to the constructor. Note that _makeGraphs() gets executed 

almost every time you do something in the GUI (zoom, shift, select 

event, ...)



+		#pragma omp parallel for
  		for (auto const &cpu: it.value()._cpuList) {
-			g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing);
+			int idx = it.value()._cpuList.indexOf(cpu);
+			cpuGraphs[idx] = _newCPUGraph(sd, cpu);
+		}
+		QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs);
+		while (itCpuGraphs.hasNext()) {
+			g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing);
  			it.value()._cpuGraphs.append(g);
  		}

  



I wonder why you add this optimization only for the CPU graphs? Please 

do the same for the Task and Combo graphs as well.


Thanks!
Yordan