Parallelize _newCPUGraph() and _newTaskGraph() calls to dramatically speed up graph rendering, particularly for traces from very large systems. OpenMP is technically a new dependency here, but it is deeply embedded in GCC toolchains; as long as your GCC is not older than v4.9, the libgomp library that ships with it will work. Signed-off-by: Libo Chen <libo.chen@xxxxxxxxxx> --- CMakeLists.txt | 6 ++++++ src/KsGLWidget.cpp | 25 +++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c36d757..8d1090a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,12 @@ set(EXECUTABLE_OUTPUT_PATH "${KS_DIR}/bin") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -pthread -fPIC -fno-common") +find_package(OpenMP 3.2.5) +if (OPENMP_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") +endif(OPENMP_FOUND) + set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) diff --git a/src/KsGLWidget.cpp b/src/KsGLWidget.cpp index 9311d98..004d64b 100644 --- a/src/KsGLWidget.cpp +++ b/src/KsGLWidget.cpp @@ -13,6 +13,9 @@ #include <GL/glut.h> #include <GL/gl.h> +// OpenMP +#include <omp.h> + // KernelShark #include "libkshark-plugin.h" #include "KsGLWidget.hpp" @@ -688,25 +691,43 @@ void KsGLWidget::_makeGraphs() return graph; }; + omp_set_num_threads(omp_get_num_procs()); for (auto it = _streamPlots.begin(); it != _streamPlots.end(); ++it) { sd = it.key(); + QVector<KsPlot::Graph *> cpuGraphs(it.value()._cpuList.count()); + QVector<KsPlot::Graph *> taskGraphs(it.value()._taskList.count()); + /* Create CPU graphs according to the cpuList. 
*/ it.value()._cpuGraphs = {}; + #pragma omp parallel for for (auto const &cpu: it.value()._cpuList) { - g = lamAddGraph(sd, _newCPUGraph(sd, cpu), _vSpacing); + int idx = it.value()._cpuList.indexOf(cpu); + cpuGraphs[idx] = _newCPUGraph(sd, cpu); + } + QVectorIterator<KsPlot::Graph *> itCpuGraphs(cpuGraphs); + while (itCpuGraphs.hasNext()) { + g = lamAddGraph(sd, itCpuGraphs.next(), _vSpacing); it.value()._cpuGraphs.append(g); } /* Create Task graphs according to the taskList. */ it.value()._taskGraphs = {}; + #pragma omp parallel for for (auto const &pid: it.value()._taskList) { - g = lamAddGraph(sd, _newTaskGraph(sd, pid), _vSpacing); + int idx = it.value()._taskList.indexOf(pid); + taskGraphs[idx] = _newTaskGraph(sd, pid); + } + QVectorIterator<KsPlot::Graph *> itTaskGraphs(taskGraphs); + while (itTaskGraphs.hasNext()) { + g = lamAddGraph(sd, itTaskGraphs.next(), _vSpacing); it.value()._taskGraphs.append(g); } + } for (auto &c: _comboPlots) { int n = c.count(); + #pragma omp parallel for for (int i = 0; i < n; ++i) { sd = c[i]._streamId; if (c[i]._type & KSHARK_TASK_DRAW) { -- 2.46.2