gpu acceleration more efficient

main v1.3
Benjamin Kraft 1 year ago
parent 08cabd9d8e
commit 05baa0db8a
  1. 47
      shaders/compute.glsl
  2. 14
      src/MainWindow.cpp
  3. 2
      src/MainWindow.h
  4. 82
      src/Simulation.cpp
  5. 5
      src/Simulation.h

@ -1,6 +1,9 @@
#version 430
#extension GL_ARB_gpu_shader_fp64 : enable
layout (local_size_x=50, local_size_y=1, local_size_z=1) in;
#define MAX_SEGMENTS 32
layout (local_size_x=MAX_SEGMENTS, local_size_y=1, local_size_z=1) in;
layout(std430, binding = 0) buffer PositionBuffer {
dvec2 X[];
@ -10,8 +13,8 @@ layout(std430, binding = 1) buffer VelocityBuffer {
dvec2 V[];
};
layout(std430, binding = 2) buffer MassBuffer {
double M[];
layout(std430, binding = 2) buffer InverseMassBuffer {
double W[];
};
layout(std430, binding = 3) buffer LengthBuffer {
@ -30,11 +33,12 @@ uniform double h;
uniform double gravity;
uniform uint substeps;
shared dvec2 P[50];
shared dvec2 P[MAX_SEGMENTS];
// distance constraint
void solveConstraint(uint segmentId, uint gIndex){
double w1 = segmentId == 0 ? 0.0 : 1.0 / M[gIndex - 1];
double w2 = 1.0 / M[gIndex];
double w1 = segmentId == 0 ? 0.0 : W[gIndex - 1];
double w2 = W[gIndex];
dvec2 s = (segmentId == 0 ? dvec2(0, 0) : P[segmentId - 1]) - P[segmentId];
dvec2 n = normalize(s);
@ -48,38 +52,35 @@ void solveConstraint(uint segmentId, uint gIndex){
P[segmentId] += deltaP2;
}
// No need for synchronization, because each pendulum gets its own warp
void main() {
uint pendulumId = gl_WorkGroupID.x;
uint i_off = indices[pendulumId];
uint count = segmentCounts[pendulumId];
uint segmentCount = segmentCounts[pendulumId];
uint localId = gl_LocalInvocationID.x;
uint index = i_off + localId;
uint gIndex = i_off + localId;
if (localId >= count)
// discard unneeded local threads for segments
if (localId >= segmentCount)
return;
// substepping is not parallelizable
for (int k = 0; k < substeps; k++){
// explicit integration with gravity as external force
V[index] = V[index] + dvec2(0, -gravity * h);
P[localId] = X[index] + V[index] * h;
memoryBarrierShared();
V[gIndex] = V[gIndex] + dvec2(0, -gravity * h);
P[localId] = X[gIndex] + V[gIndex] * h;
// every segment depends on its neighbour, so we can not parallelize all constraints
// only parallelize those, who are independent of each other, skipping every second segment
if (localId % 2 == 0){
solveConstraint(localId, i_off + localId);
if (localId + 1 < count){
memoryBarrierShared();
solveConstraint(localId + 1, i_off + localId + 1);
}
if (localId + 1 < segmentCount)
solveConstraint(localId + 1, gIndex + 1);
}
barrier();
// update position and velocity
V[index] = (P[localId] - X[index]) / h;
X[index] = P[localId];
memoryBarrierBuffer();
V[gIndex] = (P[localId] - X[gIndex]) / h;
X[gIndex] = P[localId];
}
}

@ -380,6 +380,20 @@ QWidget * MainWindow::buildSimulationUI() {
lyt->addWidget(showMasses);
}
// GPU-Acceleration
{
auto useGPU = new QCheckBox("Use GPU-Acceleration");
connect(useGPU, &QCheckBox::stateChanged, simulation, &Simulation::useGPUChanged);
connect(simulation, &Simulation::gpuNotSupported, [useGPU](const std::string &message){
useGPU->setCheckState(Qt::Unchecked);
useGPU->setEnabled(false);
useGPU->setToolTip(QString::fromStdString(message));
});
useGPU->setCheckState(Qt::Unchecked);
lyt->addWidget(useGPU);
}
return w;
}

@ -24,7 +24,7 @@ private:
GLWidget * glWidget = nullptr;
static const int MaxSegments = 50;
static const int MaxSegments = 32;
std::vector<double> masses, lengths;
int segments = 0;

@ -32,7 +32,10 @@ void Simulation::initialize() {
context->makeCurrent(surface);
initializeOpenGLFunctions();
if (!context->hasExtension("GL_ARB_gpu_shader_fp64")){
printf("Double precision not supported by OpenGL!\n");
std::string message = "Double precision not supported by OpenGL! Falling back to CPU-Multithreading.";
printf("%s\n", message.c_str());
useGPUAcceleration = false;
emit gpuNotSupported(message);
}
}
@ -58,7 +61,7 @@ void Simulation::initialize() {
glGenBuffers(6, &SSBO.positions);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO.positions);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, SSBO.velocities);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.masses);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.invMasses);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, SSBO.lengths);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, SSBO.indices);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, SSBO.segmentCounts);
@ -70,7 +73,7 @@ void Simulation::initialize() {
void Simulation::updateGPUData() {
std::vector<GLdouble> positions;
std::vector<GLdouble> velocities;
std::vector<GLdouble> masses;
std::vector<GLdouble> invMasses;
std::vector<GLdouble> lengths;
std::vector<GLuint> indices;
std::vector<GLuint> segmentCounts;
@ -81,18 +84,18 @@ void Simulation::updateGPUData() {
currentIndex += p->X.size();
segmentCounts.push_back(p->X.size());
for (size_t i = 0; i < p->X.size(); i++){
positions.push_back(GLdouble(p->X[i].x));
positions.push_back(GLdouble(p->X[i].y));
velocities.push_back(GLdouble(p->V[i].x));
velocities.push_back(GLdouble(p->V[i].y));
masses.push_back(GLfloat(p->M[i]));
lengths.push_back(GLfloat(p->L[i]));
positions.push_back(p->X[i].x);
positions.push_back(p->X[i].y);
velocities.push_back(p->V[i].x);
velocities.push_back(p->V[i].y);
invMasses.push_back(1.0 / p->M[i]);
lengths.push_back(p->L[i]);
}
}
auto posSize = GLsizeiptr(positions.size() * sizeof(GLdouble));
auto velSize = GLsizeiptr(velocities.size() * sizeof(GLdouble));
auto massSize = GLsizeiptr(masses.size() * sizeof(GLdouble));
auto invMassSize = GLsizeiptr(invMasses.size() * sizeof(GLdouble));
auto lengthSize = GLsizeiptr(lengths.size() * sizeof(GLdouble));
auto indicesSize = GLsizeiptr(indices.size() * sizeof(GLuint));
auto segmentCountsSize = GLsizeiptr(segmentCounts.size() * sizeof(GLuint));
@ -101,8 +104,8 @@ void Simulation::updateGPUData() {
glBufferData(GL_SHADER_STORAGE_BUFFER, posSize, positions.data(), GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
glBufferData(GL_SHADER_STORAGE_BUFFER, velSize, velocities.data(), GL_DYNAMIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.masses);
glBufferData(GL_SHADER_STORAGE_BUFFER, massSize, masses.data(), GL_STATIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.invMasses);
glBufferData(GL_SHADER_STORAGE_BUFFER, invMassSize, invMasses.data(), GL_STATIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.lengths);
glBufferData(GL_SHADER_STORAGE_BUFFER, lengthSize, lengths.data(), GL_STATIC_DRAW);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.indices);
@ -125,33 +128,12 @@ void Simulation::update() {
double newPotentialEnergy = 0;
double newKineticEnergy = 0;
auto t1 = system_clock::now();
/*
// OLD with CPUs
{
#pragma omp parallel for
for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) // not ranged based for msvc
for (int k = 0; k < substeps; k++)
pendula[i]->update(h, gravity);
double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
double localKineticEnergy = pendula[i]->kineticEnergy();
#pragma omp atomic
newPotentialEnergy += localPotentialEnergy;
#pragma omp atomic
newKineticEnergy += localKineticEnergy;
}
}*/
// NEW with GPUs
{
if (useGPUAcceleration) {
program->bind();
glUniform1d(program->uniformLocation("h"), h);
glUniform1d(program->uniformLocation("gravity"), gravity);
glUniform1ui(program->uniformLocation("substeps"), substeps);
glDispatchCompute(pendula.size(), 1, 1);
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
@ -168,6 +150,7 @@ void Simulation::update() {
}
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
}
// Read updated velocities
{
glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
@ -181,10 +164,31 @@ void Simulation::update() {
}
glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
}
}
std::cout << duration_cast<microseconds>(system_clock::now() - t1).count() << std::endl;
#pragma omp parallel for
for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
double localKineticEnergy = pendula[i]->kineticEnergy();
#pragma omp atomic
newPotentialEnergy += localPotentialEnergy;
#pragma omp atomic
newKineticEnergy += localKineticEnergy;
}
} else {
// CPU-Multithreading
#pragma omp parallel for
for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
for (int k = 0; k < substeps; k++)
pendula[i]->update(h, gravity);
double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
double localKineticEnergy = pendula[i]->kineticEnergy();
#pragma omp atomic
newPotentialEnergy += localPotentialEnergy;
#pragma omp atomic
newKineticEnergy += localKineticEnergy;
}
}
potentialEnergy = newPotentialEnergy;
kineticEnergy = newKineticEnergy;
@ -229,3 +233,9 @@ void Simulation::updateEnergy() {
}
}
void Simulation::useGPUChanged(int state) {
useGPUAcceleration = state == Qt::Checked;
if (useGPUAcceleration)
updateGPUData();
}

@ -32,11 +32,13 @@ public:
void initialize();
void updateEnergy();
signals:
void gpuNotSupported(const std::string &message);
void layoutChanged();
void positionChanged();
public slots:
void clearPendula();
void addPendula(const std::vector<Pendulum *> &add);
void useGPUChanged(int);
private slots:
void update();
private:
@ -50,11 +52,12 @@ private:
struct SSBO {
GLuint positions;
GLuint velocities;
GLuint masses;
GLuint invMasses;
GLuint lengths;
GLuint indices;
GLuint segmentCounts;
} SSBO;
QTimer * timer;
int updateInterval = 16;
bool useGPUAcceleration = false;
};
Loading…
Cancel
Save