From 05baa0db8a932d42443c0271a4c661078e741a42 Mon Sep 17 00:00:00 2001 From: Benjamin Kraft Date: Thu, 21 Sep 2023 18:08:22 +0200 Subject: [PATCH] gpu acceleration more efficient --- shaders/compute.glsl | 47 ++++++++++++------------- src/MainWindow.cpp | 14 ++++++++ src/MainWindow.h | 2 +- src/Simulation.cpp | 82 +++++++++++++++++++++++++------------------- src/Simulation.h | 5 ++- 5 files changed, 89 insertions(+), 61 deletions(-) diff --git a/shaders/compute.glsl b/shaders/compute.glsl index 1b7a7d7..400efbf 100644 --- a/shaders/compute.glsl +++ b/shaders/compute.glsl @@ -1,6 +1,9 @@ #version 430 #extension GL_ARB_gpu_shader_fp64 : enable -layout (local_size_x=50, local_size_y=1, local_size_z=1) in; + +#define MAX_SEGMENTS 32 + +layout (local_size_x=MAX_SEGMENTS, local_size_y=1, local_size_z=1) in; layout(std430, binding = 0) buffer PositionBuffer { dvec2 X[]; @@ -10,8 +13,8 @@ layout(std430, binding = 1) buffer VelocityBuffer { dvec2 V[]; }; -layout(std430, binding = 2) buffer MassBuffer { - double M[]; +layout(std430, binding = 2) buffer InverseMassBuffer { + double W[]; }; layout(std430, binding = 3) buffer LengthBuffer { @@ -30,11 +33,12 @@ uniform double h; uniform double gravity; uniform uint substeps; -shared dvec2 P[50]; +shared dvec2 P[MAX_SEGMENTS]; +// distance constraint void solveConstraint(uint segmentId, uint gIndex){ - double w1 = segmentId == 0 ? 0.0 : 1.0 / M[gIndex - 1]; - double w2 = 1.0 / M[gIndex]; + double w1 = segmentId == 0 ? 0.0 : W[gIndex - 1]; + double w2 = W[gIndex]; dvec2 s = (segmentId == 0 ? dvec2(0, 0) : P[segmentId - 1]) - P[segmentId]; dvec2 n = normalize(s); @@ -48,38 +52,35 @@ void solveConstraint(uint segmentId, uint gIndex){ P[segmentId] += deltaP2; } +// No need for synchronization, because each pendulum gets its own warp void main() { uint pendulumId = gl_WorkGroupID.x; uint i_off = indices[pendulumId]; - uint count = segmentCounts[pendulumId]; + uint segmentCount = segmentCounts[pendulumId]; uint localId = gl_LocalInvocationID.x; - uint index = i_off + localId; + uint gIndex = i_off + localId; - if (localId >= count) + // discard unneeded local threads for segments + if (localId >= segmentCount) return; + // substepping is not parallelizable for (int k = 0; k < substeps; k++){ // explicit integration with gravity as external force - V[index] = V[index] + dvec2(0, -gravity * h); - P[localId] = X[index] + V[index] * h; - - memoryBarrierShared(); + V[gIndex] = V[gIndex] + dvec2(0, -gravity * h); + P[localId] = X[gIndex] + V[gIndex] * h; + // every segment depends on its neighbour, so we can not parallelize all constraints + // only parallelize those, who are independent of each other, skipping every second segment if (localId % 2 == 0){ solveConstraint(localId, i_off + localId); - if (localId + 1 < count){ - memoryBarrierShared(); - solveConstraint(localId + 1, i_off + localId + 1); - } + if (localId + 1 < segmentCount) + solveConstraint(localId + 1, gIndex + 1); } - barrier(); - // update position and velocity - V[index] = (P[localId] - X[index]) / h; - X[index] = P[localId]; - - memoryBarrierBuffer(); + V[gIndex] = (P[localId] - X[gIndex]) / h; + X[gIndex] = P[localId]; } } diff --git a/src/MainWindow.cpp b/src/MainWindow.cpp index ab7b54c..39356e2 100644 --- a/src/MainWindow.cpp +++ b/src/MainWindow.cpp @@ -380,6 +380,20 @@ QWidget * MainWindow::buildSimulationUI() { lyt->addWidget(showMasses); } + // GPU-Acceleration + { + auto useGPU = new QCheckBox("Use GPU-Acceleration"); + connect(useGPU, &QCheckBox::stateChanged, simulation, &Simulation::useGPUChanged); + connect(simulation, &Simulation::gpuNotSupported, [useGPU](const std::string &message){ + useGPU->setCheckState(Qt::Unchecked); + useGPU->setEnabled(false); + useGPU->setToolTip(QString::fromStdString(message)); + }); + useGPU->setCheckState(Qt::Unchecked); + + lyt->addWidget(useGPU); + } + return w; } diff --git a/src/MainWindow.h b/src/MainWindow.h index c69fcdd..0871550 100644 --- a/src/MainWindow.h +++ b/src/MainWindow.h @@ -24,7 +24,7 @@ private: GLWidget * glWidget = nullptr; - static const int MaxSegments = 50; + static const int MaxSegments = 32; std::vector masses, lengths; int segments = 0; diff --git a/src/Simulation.cpp b/src/Simulation.cpp index 96a4f5e..9d77b9d 100644 --- a/src/Simulation.cpp +++ b/src/Simulation.cpp @@ -32,7 +32,10 @@ void Simulation::initialize() { context->makeCurrent(surface); initializeOpenGLFunctions(); if (!context->hasExtension("GL_ARB_gpu_shader_fp64")){ - printf("Double precision not supported by OpenGL!\n"); + std::string message = "Double precision not supported by OpenGL! Falling back to CPU-Multithreading."; + printf("%s\n", message.c_str()); + useGPUAcceleration = false; + emit gpuNotSupported(message); } } @@ -58,7 +61,7 @@ void Simulation::initialize() { glGenBuffers(6, &SSBO.positions); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO.positions); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, SSBO.velocities); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.masses); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.invMasses); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, SSBO.lengths); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, SSBO.indices); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, SSBO.segmentCounts); @@ -70,7 +73,7 @@ void Simulation::initialize() { void Simulation::updateGPUData() { std::vector positions; std::vector velocities; - std::vector masses; + std::vector invMasses; std::vector lengths; std::vector indices; std::vector segmentCounts; @@ -81,18 +84,18 @@ void Simulation::updateGPUData() { currentIndex += p->X.size(); segmentCounts.push_back(p->X.size()); for (size_t i = 0; i < p->X.size(); i++){ - positions.push_back(GLdouble(p->X[i].x)); - positions.push_back(GLdouble(p->X[i].y)); - velocities.push_back(GLdouble(p->V[i].x)); - velocities.push_back(GLdouble(p->V[i].y)); - masses.push_back(GLfloat(p->M[i])); - lengths.push_back(GLfloat(p->L[i])); + positions.push_back(p->X[i].x); + positions.push_back(p->X[i].y); + velocities.push_back(p->V[i].x); + velocities.push_back(p->V[i].y); + invMasses.push_back(1.0 / p->M[i]); + lengths.push_back(p->L[i]); } } auto posSize = GLsizeiptr(positions.size() * sizeof(GLdouble)); auto velSize = GLsizeiptr(velocities.size() * sizeof(GLdouble)); - auto massSize = GLsizeiptr(masses.size() * sizeof(GLdouble)); + auto invMassSize = GLsizeiptr(invMasses.size() * sizeof(GLdouble)); auto lengthSize = GLsizeiptr(lengths.size() * sizeof(GLdouble)); auto indicesSize = GLsizeiptr(indices.size() * sizeof(GLuint)); auto segmentCountsSize = GLsizeiptr(segmentCounts.size() * sizeof(GLuint)); @@ -101,8 +104,8 @@ void Simulation::updateGPUData() { glBufferData(GL_SHADER_STORAGE_BUFFER, posSize, positions.data(), GL_DYNAMIC_DRAW); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities); glBufferData(GL_SHADER_STORAGE_BUFFER, velSize, velocities.data(), GL_DYNAMIC_DRAW); - glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.masses); - glBufferData(GL_SHADER_STORAGE_BUFFER, massSize, masses.data(), GL_STATIC_DRAW); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.invMasses); + glBufferData(GL_SHADER_STORAGE_BUFFER, invMassSize, invMasses.data(), GL_STATIC_DRAW); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.lengths); glBufferData(GL_SHADER_STORAGE_BUFFER, lengthSize, lengths.data(), GL_STATIC_DRAW); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.indices); @@ -125,33 +128,12 @@ void Simulation::update() { double newPotentialEnergy = 0; double newKineticEnergy = 0; - auto t1 = system_clock::now(); - - /* - // OLD with CPUs - { - #pragma omp parallel for - for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) // not ranged based for msvc - for (int k = 0; k < substeps; k++) - pendula[i]->update(h, gravity); - double localPotentialEnergy = pendula[i]->potentialEnergy(gravity); - double localKineticEnergy = pendula[i]->kineticEnergy(); - #pragma omp atomic - newPotentialEnergy += localPotentialEnergy; - #pragma omp atomic - newKineticEnergy += localKineticEnergy; - } - }*/ - - - // NEW with GPUs - { + if (useGPUAcceleration) { program->bind(); glUniform1d(program->uniformLocation("h"), h); glUniform1d(program->uniformLocation("gravity"), gravity); glUniform1ui(program->uniformLocation("substeps"), substeps); - glDispatchCompute(pendula.size(), 1, 1); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); @@ -168,6 +150,7 @@ void Simulation::update() { } glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); } + // Read updated velocities { glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities); @@ -181,10 +164,31 @@ void Simulation::update() { } glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); } - } - std::cout << duration_cast(system_clock::now() - t1).count() << std::endl; + #pragma omp parallel for + for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc + double localPotentialEnergy = pendula[i]->potentialEnergy(gravity); + double localKineticEnergy = pendula[i]->kineticEnergy(); + #pragma omp atomic + newPotentialEnergy += localPotentialEnergy; + #pragma omp atomic + newKineticEnergy += localKineticEnergy; + } + } else { + // CPU-Multithreading + #pragma omp parallel for + for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc + for (int k = 0; k < substeps; k++) + pendula[i]->update(h, gravity); + double localPotentialEnergy = pendula[i]->potentialEnergy(gravity); + double localKineticEnergy = pendula[i]->kineticEnergy(); + #pragma omp atomic + newPotentialEnergy += localPotentialEnergy; + #pragma omp atomic + newKineticEnergy += localKineticEnergy; + } + } potentialEnergy = newPotentialEnergy; kineticEnergy = newKineticEnergy; @@ -229,3 +233,9 @@ void Simulation::updateEnergy() { } } +void Simulation::useGPUChanged(int state) { + useGPUAcceleration = state == Qt::Checked; + if (useGPUAcceleration) + updateGPUData(); +} + diff --git a/src/Simulation.h b/src/Simulation.h index 6222ad7..6af7bbe 100644 --- a/src/Simulation.h +++ b/src/Simulation.h @@ -32,11 +32,13 @@ public: void initialize(); void updateEnergy(); signals: + void gpuNotSupported(const std::string &message); void layoutChanged(); void positionChanged(); public slots: void clearPendula(); void addPendula(const std::vector &add); + void useGPUChanged(int); private slots: void update(); private: @@ -50,11 +52,12 @@ private: struct SSBO { GLuint positions; GLuint velocities; - GLuint masses; + GLuint invMasses; GLuint lengths; GLuint indices; GLuint segmentCounts; } SSBO; QTimer * timer; int updateInterval = 16; + bool useGPUAcceleration = false; }; \ No newline at end of file