gpu acceleration more efficient

main v1.3
Benjamin Kraft 1 year ago
parent 08cabd9d8e
commit 05baa0db8a
  1. 47
      shaders/compute.glsl
  2. 14
      src/MainWindow.cpp
  3. 2
      src/MainWindow.h
  4. 82
      src/Simulation.cpp
  5. 5
      src/Simulation.h

@ -1,6 +1,9 @@
#version 430 #version 430
#extension GL_ARB_gpu_shader_fp64 : enable #extension GL_ARB_gpu_shader_fp64 : enable
layout (local_size_x=50, local_size_y=1, local_size_z=1) in;
#define MAX_SEGMENTS 32
layout (local_size_x=MAX_SEGMENTS, local_size_y=1, local_size_z=1) in;
layout(std430, binding = 0) buffer PositionBuffer { layout(std430, binding = 0) buffer PositionBuffer {
dvec2 X[]; dvec2 X[];
@ -10,8 +13,8 @@ layout(std430, binding = 1) buffer VelocityBuffer {
dvec2 V[]; dvec2 V[];
}; };
layout(std430, binding = 2) buffer MassBuffer { layout(std430, binding = 2) buffer InverseMassBuffer {
double M[]; double W[];
}; };
layout(std430, binding = 3) buffer LengthBuffer { layout(std430, binding = 3) buffer LengthBuffer {
@ -30,11 +33,12 @@ uniform double h;
uniform double gravity; uniform double gravity;
uniform uint substeps; uniform uint substeps;
shared dvec2 P[50]; shared dvec2 P[MAX_SEGMENTS];
// distance constraint
void solveConstraint(uint segmentId, uint gIndex){ void solveConstraint(uint segmentId, uint gIndex){
double w1 = segmentId == 0 ? 0.0 : 1.0 / M[gIndex - 1]; double w1 = segmentId == 0 ? 0.0 : W[gIndex - 1];
double w2 = 1.0 / M[gIndex]; double w2 = W[gIndex];
dvec2 s = (segmentId == 0 ? dvec2(0, 0) : P[segmentId - 1]) - P[segmentId]; dvec2 s = (segmentId == 0 ? dvec2(0, 0) : P[segmentId - 1]) - P[segmentId];
dvec2 n = normalize(s); dvec2 n = normalize(s);
@ -48,38 +52,35 @@ void solveConstraint(uint segmentId, uint gIndex){
P[segmentId] += deltaP2; P[segmentId] += deltaP2;
} }
// No need for synchronization, because each pendulum gets its own warp
void main() { void main() {
uint pendulumId = gl_WorkGroupID.x; uint pendulumId = gl_WorkGroupID.x;
uint i_off = indices[pendulumId]; uint i_off = indices[pendulumId];
uint count = segmentCounts[pendulumId]; uint segmentCount = segmentCounts[pendulumId];
uint localId = gl_LocalInvocationID.x; uint localId = gl_LocalInvocationID.x;
uint index = i_off + localId; uint gIndex = i_off + localId;
if (localId >= count) // discard unneeded local threads for segments
if (localId >= segmentCount)
return; return;
// substepping is not parallelizable
for (int k = 0; k < substeps; k++){ for (int k = 0; k < substeps; k++){
// explicit integration with gravity as external force // explicit integration with gravity as external force
V[index] = V[index] + dvec2(0, -gravity * h); V[gIndex] = V[gIndex] + dvec2(0, -gravity * h);
P[localId] = X[index] + V[index] * h; P[localId] = X[gIndex] + V[gIndex] * h;
memoryBarrierShared();
// every segment depends on its neighbour, so we can not parallelize all constraints
// only parallelize those, who are independent of each other, skipping every second segment
if (localId % 2 == 0){ if (localId % 2 == 0){
solveConstraint(localId, i_off + localId); solveConstraint(localId, i_off + localId);
if (localId + 1 < count){ if (localId + 1 < segmentCount)
memoryBarrierShared(); solveConstraint(localId + 1, gIndex + 1);
solveConstraint(localId + 1, i_off + localId + 1);
}
} }
barrier();
// update position and velocity // update position and velocity
V[index] = (P[localId] - X[index]) / h; V[gIndex] = (P[localId] - X[gIndex]) / h;
X[index] = P[localId]; X[gIndex] = P[localId];
memoryBarrierBuffer();
} }
} }

@ -380,6 +380,20 @@ QWidget * MainWindow::buildSimulationUI() {
lyt->addWidget(showMasses); lyt->addWidget(showMasses);
} }
// GPU-Acceleration
{
auto useGPU = new QCheckBox("Use GPU-Acceleration");
connect(useGPU, &QCheckBox::stateChanged, simulation, &Simulation::useGPUChanged);
connect(simulation, &Simulation::gpuNotSupported, [useGPU](const std::string &message){
useGPU->setCheckState(Qt::Unchecked);
useGPU->setEnabled(false);
useGPU->setToolTip(QString::fromStdString(message));
});
useGPU->setCheckState(Qt::Unchecked);
lyt->addWidget(useGPU);
}
return w; return w;
} }

@ -24,7 +24,7 @@ private:
GLWidget * glWidget = nullptr; GLWidget * glWidget = nullptr;
static const int MaxSegments = 50; static const int MaxSegments = 32;
std::vector<double> masses, lengths; std::vector<double> masses, lengths;
int segments = 0; int segments = 0;

@ -32,7 +32,10 @@ void Simulation::initialize() {
context->makeCurrent(surface); context->makeCurrent(surface);
initializeOpenGLFunctions(); initializeOpenGLFunctions();
if (!context->hasExtension("GL_ARB_gpu_shader_fp64")){ if (!context->hasExtension("GL_ARB_gpu_shader_fp64")){
printf("Double precision not supported by OpenGL!\n"); std::string message = "Double precision not supported by OpenGL! Falling back to CPU-Multithreading.";
printf("%s\n", message.c_str());
useGPUAcceleration = false;
emit gpuNotSupported(message);
} }
} }
@ -58,7 +61,7 @@ void Simulation::initialize() {
glGenBuffers(6, &SSBO.positions); glGenBuffers(6, &SSBO.positions);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO.positions); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO.positions);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, SSBO.velocities); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, SSBO.velocities);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.masses); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.invMasses);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, SSBO.lengths); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, SSBO.lengths);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, SSBO.indices); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, SSBO.indices);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, SSBO.segmentCounts); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, SSBO.segmentCounts);
@ -70,7 +73,7 @@ void Simulation::initialize() {
void Simulation::updateGPUData() { void Simulation::updateGPUData() {
std::vector<GLdouble> positions; std::vector<GLdouble> positions;
std::vector<GLdouble> velocities; std::vector<GLdouble> velocities;
std::vector<GLdouble> masses; std::vector<GLdouble> invMasses;
std::vector<GLdouble> lengths; std::vector<GLdouble> lengths;
std::vector<GLuint> indices; std::vector<GLuint> indices;
std::vector<GLuint> segmentCounts; std::vector<GLuint> segmentCounts;
@ -81,18 +84,18 @@ void Simulation::updateGPUData() {
currentIndex += p->X.size(); currentIndex += p->X.size();
segmentCounts.push_back(p->X.size()); segmentCounts.push_back(p->X.size());
for (size_t i = 0; i < p->X.size(); i++){ for (size_t i = 0; i < p->X.size(); i++){
positions.push_back(GLdouble(p->X[i].x)); positions.push_back(p->X[i].x);
positions.push_back(GLdouble(p->X[i].y)); positions.push_back(p->X[i].y);
velocities.push_back(GLdouble(p->V[i].x)); velocities.push_back(p->V[i].x);
velocities.push_back(GLdouble(p->V[i].y)); velocities.push_back(p->V[i].y);
masses.push_back(GLfloat(p->M[i])); invMasses.push_back(1.0 / p->M[i]);
lengths.push_back(GLfloat(p->L[i])); lengths.push_back(p->L[i]);
} }
} }
auto posSize = GLsizeiptr(positions.size() * sizeof(GLdouble)); auto posSize = GLsizeiptr(positions.size() * sizeof(GLdouble));
auto velSize = GLsizeiptr(velocities.size() * sizeof(GLdouble)); auto velSize = GLsizeiptr(velocities.size() * sizeof(GLdouble));
auto massSize = GLsizeiptr(masses.size() * sizeof(GLdouble)); auto invMassSize = GLsizeiptr(invMasses.size() * sizeof(GLdouble));
auto lengthSize = GLsizeiptr(lengths.size() * sizeof(GLdouble)); auto lengthSize = GLsizeiptr(lengths.size() * sizeof(GLdouble));
auto indicesSize = GLsizeiptr(indices.size() * sizeof(GLuint)); auto indicesSize = GLsizeiptr(indices.size() * sizeof(GLuint));
auto segmentCountsSize = GLsizeiptr(segmentCounts.size() * sizeof(GLuint)); auto segmentCountsSize = GLsizeiptr(segmentCounts.size() * sizeof(GLuint));
@ -101,8 +104,8 @@ void Simulation::updateGPUData() {
glBufferData(GL_SHADER_STORAGE_BUFFER, posSize, positions.data(), GL_DYNAMIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, posSize, positions.data(), GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
glBufferData(GL_SHADER_STORAGE_BUFFER, velSize, velocities.data(), GL_DYNAMIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, velSize, velocities.data(), GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.masses); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.invMasses);
glBufferData(GL_SHADER_STORAGE_BUFFER, massSize, masses.data(), GL_STATIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, invMassSize, invMasses.data(), GL_STATIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.lengths); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.lengths);
glBufferData(GL_SHADER_STORAGE_BUFFER, lengthSize, lengths.data(), GL_STATIC_DRAW); glBufferData(GL_SHADER_STORAGE_BUFFER, lengthSize, lengths.data(), GL_STATIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.indices); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.indices);
@ -125,33 +128,12 @@ void Simulation::update() {
double newPotentialEnergy = 0; double newPotentialEnergy = 0;
double newKineticEnergy = 0; double newKineticEnergy = 0;
auto t1 = system_clock::now(); if (useGPUAcceleration) {
/*
// OLD with CPUs
{
#pragma omp parallel for
for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) // not ranged based for msvc
for (int k = 0; k < substeps; k++)
pendula[i]->update(h, gravity);
double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
double localKineticEnergy = pendula[i]->kineticEnergy();
#pragma omp atomic
newPotentialEnergy += localPotentialEnergy;
#pragma omp atomic
newKineticEnergy += localKineticEnergy;
}
}*/
// NEW with GPUs
{
program->bind(); program->bind();
glUniform1d(program->uniformLocation("h"), h); glUniform1d(program->uniformLocation("h"), h);
glUniform1d(program->uniformLocation("gravity"), gravity); glUniform1d(program->uniformLocation("gravity"), gravity);
glUniform1ui(program->uniformLocation("substeps"), substeps); glUniform1ui(program->uniformLocation("substeps"), substeps);
glDispatchCompute(pendula.size(), 1, 1); glDispatchCompute(pendula.size(), 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
@ -168,6 +150,7 @@ void Simulation::update() {
} }
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
} }
// Read updated velocities // Read updated velocities
{ {
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities); glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
@ -181,10 +164,31 @@ void Simulation::update() {
} }
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER); glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
} }
}
std::cout << duration_cast<microseconds>(system_clock::now() - t1).count() << std::endl; #pragma omp parallel for
for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
double localKineticEnergy = pendula[i]->kineticEnergy();
#pragma omp atomic
newPotentialEnergy += localPotentialEnergy;
#pragma omp atomic
newKineticEnergy += localKineticEnergy;
}
} else {
// CPU-Multithreading
#pragma omp parallel for
for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
for (int k = 0; k < substeps; k++)
pendula[i]->update(h, gravity);
double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
double localKineticEnergy = pendula[i]->kineticEnergy();
#pragma omp atomic
newPotentialEnergy += localPotentialEnergy;
#pragma omp atomic
newKineticEnergy += localKineticEnergy;
}
}
potentialEnergy = newPotentialEnergy; potentialEnergy = newPotentialEnergy;
kineticEnergy = newKineticEnergy; kineticEnergy = newKineticEnergy;
@ -229,3 +233,9 @@ void Simulation::updateEnergy() {
} }
} }
void Simulation::useGPUChanged(int state) {
useGPUAcceleration = state == Qt::Checked;
if (useGPUAcceleration)
updateGPUData();
}

@ -32,11 +32,13 @@ public:
void initialize(); void initialize();
void updateEnergy(); void updateEnergy();
signals: signals:
void gpuNotSupported(const std::string &message);
void layoutChanged(); void layoutChanged();
void positionChanged(); void positionChanged();
public slots: public slots:
void clearPendula(); void clearPendula();
void addPendula(const std::vector<Pendulum *> &add); void addPendula(const std::vector<Pendulum *> &add);
void useGPUChanged(int);
private slots: private slots:
void update(); void update();
private: private:
@ -50,11 +52,12 @@ private:
struct SSBO { struct SSBO {
GLuint positions; GLuint positions;
GLuint velocities; GLuint velocities;
GLuint masses; GLuint invMasses;
GLuint lengths; GLuint lengths;
GLuint indices; GLuint indices;
GLuint segmentCounts; GLuint segmentCounts;
} SSBO; } SSBO;
QTimer * timer; QTimer * timer;
int updateInterval = 16; int updateInterval = 16;
bool useGPUAcceleration = false;
}; };
Loading…
Cancel
Save