From 05baa0db8a932d42443c0271a4c661078e741a42 Mon Sep 17 00:00:00 2001
From: Benjamin Kraft <benjamin.kraft@online.de>
Date: Thu, 21 Sep 2023 18:08:22 +0200
Subject: [PATCH] gpu acceleration more efficient

---
 shaders/compute.glsl | 47 ++++++++++++-------------
 src/MainWindow.cpp   | 14 ++++++++
 src/MainWindow.h     |  2 +-
 src/Simulation.cpp   | 82 +++++++++++++++++++++++++-------------------
 src/Simulation.h     |  5 ++-
 5 files changed, 89 insertions(+), 61 deletions(-)

diff --git a/shaders/compute.glsl b/shaders/compute.glsl
index 1b7a7d7..400efbf 100644
--- a/shaders/compute.glsl
+++ b/shaders/compute.glsl
@@ -1,6 +1,9 @@
 #version 430
 #extension GL_ARB_gpu_shader_fp64 : enable
-layout (local_size_x=50, local_size_y=1, local_size_z=1) in;
+
+#define MAX_SEGMENTS 32
+
+layout (local_size_x=MAX_SEGMENTS, local_size_y=1, local_size_z=1) in;
 
 layout(std430, binding = 0) buffer PositionBuffer {
     dvec2 X[];
@@ -10,8 +13,8 @@ layout(std430, binding = 1) buffer VelocityBuffer {
     dvec2 V[];
 };
 
-layout(std430, binding = 2) buffer MassBuffer {
-    double M[];
+layout(std430, binding = 2) buffer InverseMassBuffer {
+    double W[];
 };
 
 layout(std430, binding = 3) buffer LengthBuffer {
@@ -30,11 +33,12 @@ uniform double h;
 uniform double gravity;
 uniform uint substeps;
 
-shared dvec2 P[50];
+shared dvec2 P[MAX_SEGMENTS];
 
+// distance constraint
 void solveConstraint(uint segmentId, uint gIndex){
-    double w1 = segmentId == 0 ? 0.0 : 1.0 / M[gIndex - 1];
-    double w2 = 1.0 / M[gIndex];
+    double w1 = segmentId == 0 ? 0.0 : W[gIndex - 1];
+    double w2 = W[gIndex];
 
     dvec2 s = (segmentId == 0 ? dvec2(0, 0) : P[segmentId - 1]) - P[segmentId];
     dvec2 n = normalize(s);
@@ -48,38 +52,35 @@ void solveConstraint(uint segmentId, uint gIndex){
     P[segmentId] += deltaP2;
 }
 
+// No need for synchronization, because each pendulum gets its own warp
 void main() {
     uint pendulumId = gl_WorkGroupID.x;
     uint i_off = indices[pendulumId];
-    uint count = segmentCounts[pendulumId];
+    uint segmentCount = segmentCounts[pendulumId];
     uint localId = gl_LocalInvocationID.x;
-    uint index = i_off + localId;
+    uint gIndex = i_off + localId;
 
-    if (localId >= count)
+    // discard unneeded local threads for segments
+    if (localId >= segmentCount)
         return;
 
+    // substepping is not parallelizable
     for (int k = 0; k < substeps; k++){
 
         // explicit integration with gravity as external force
-        V[index] = V[index] + dvec2(0, -gravity * h);
-        P[localId] = X[index] + V[index] * h;
-
-        memoryBarrierShared();
+        V[gIndex] = V[gIndex] + dvec2(0, -gravity * h);
+        P[localId] = X[gIndex] + V[gIndex] * h;
 
+        // every segment depends on its neighbour, so we can not parallelize all constraints
+        // only parallelize those, who are independent of each other, skipping every second segment
         if (localId % 2 == 0){
             solveConstraint(localId, i_off + localId);
-            if (localId + 1 < count){
-                memoryBarrierShared();
-                solveConstraint(localId + 1, i_off + localId + 1);
-            }
+            if (localId + 1 < segmentCount)
+                solveConstraint(localId + 1, gIndex + 1);
         }
 
-        barrier();
-
         // update position and velocity
-        V[index] = (P[localId] - X[index]) / h;
-        X[index] = P[localId];
-
-        memoryBarrierBuffer();
+        V[gIndex] = (P[localId] - X[gIndex]) / h;
+        X[gIndex] = P[localId];
     }
 }
diff --git a/src/MainWindow.cpp b/src/MainWindow.cpp
index ab7b54c..39356e2 100644
--- a/src/MainWindow.cpp
+++ b/src/MainWindow.cpp
@@ -380,6 +380,20 @@ QWidget * MainWindow::buildSimulationUI() {
 		lyt->addWidget(showMasses);
 	}
 
+	// GPU-Acceleration
+	{
+		auto useGPU = new QCheckBox("Use GPU-Acceleration");
+		connect(useGPU, &QCheckBox::stateChanged, simulation, &Simulation::useGPUChanged);
+		connect(simulation, &Simulation::gpuNotSupported, [useGPU](const std::string &message){
+			useGPU->setCheckState(Qt::Unchecked);
+			useGPU->setEnabled(false);
+			useGPU->setToolTip(QString::fromStdString(message));
+		});
+		useGPU->setCheckState(Qt::Unchecked);
+
+		lyt->addWidget(useGPU);
+	}
+
 	return w;
 }
 
diff --git a/src/MainWindow.h b/src/MainWindow.h
index c69fcdd..0871550 100644
--- a/src/MainWindow.h
+++ b/src/MainWindow.h
@@ -24,7 +24,7 @@ private:
 
 	GLWidget * glWidget = nullptr;
 
-	static const int MaxSegments = 50;
+	static const int MaxSegments = 32;
 
 	std::vector<double> masses, lengths;
 	int segments = 0;
diff --git a/src/Simulation.cpp b/src/Simulation.cpp
index 96a4f5e..9d77b9d 100644
--- a/src/Simulation.cpp
+++ b/src/Simulation.cpp
@@ -32,7 +32,10 @@ void Simulation::initialize() {
 		context->makeCurrent(surface);
 		initializeOpenGLFunctions();
 		if (!context->hasExtension("GL_ARB_gpu_shader_fp64")){
-			printf("Double precision not supported by OpenGL!\n");
+			std::string message = "Double precision not supported by OpenGL! Falling back to CPU-Multithreading.";
+			printf("%s\n", message.c_str());
+			useGPUAcceleration = false;
+			emit gpuNotSupported(message);
 		}
 	}
 
@@ -58,7 +61,7 @@ void Simulation::initialize() {
 		glGenBuffers(6, &SSBO.positions);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO.positions);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, SSBO.velocities);
-		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.masses);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.invMasses);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, SSBO.lengths);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, SSBO.indices);
 		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, SSBO.segmentCounts);
@@ -70,7 +73,7 @@ void Simulation::initialize() {
 void Simulation::updateGPUData() {
 	std::vector<GLdouble> positions;
 	std::vector<GLdouble> velocities;
-	std::vector<GLdouble> masses;
+	std::vector<GLdouble> invMasses;
 	std::vector<GLdouble> lengths;
 	std::vector<GLuint> indices;
 	std::vector<GLuint> segmentCounts;
@@ -81,18 +84,18 @@ void Simulation::updateGPUData() {
 		currentIndex += p->X.size();
 		segmentCounts.push_back(p->X.size());
 		for (size_t i = 0; i < p->X.size(); i++){
-			positions.push_back(GLdouble(p->X[i].x));
-			positions.push_back(GLdouble(p->X[i].y));
-			velocities.push_back(GLdouble(p->V[i].x));
-			velocities.push_back(GLdouble(p->V[i].y));
-			masses.push_back(GLfloat(p->M[i]));
-			lengths.push_back(GLfloat(p->L[i]));
+			positions.push_back(p->X[i].x);
+			positions.push_back(p->X[i].y);
+			velocities.push_back(p->V[i].x);
+			velocities.push_back(p->V[i].y);
+			invMasses.push_back(1.0 / p->M[i]);
+			lengths.push_back(p->L[i]);
 		}
 	}
 
 	auto posSize = GLsizeiptr(positions.size() * sizeof(GLdouble));
 	auto velSize = GLsizeiptr(velocities.size() * sizeof(GLdouble));
-	auto massSize = GLsizeiptr(masses.size() * sizeof(GLdouble));
+	auto invMassSize = GLsizeiptr(invMasses.size() * sizeof(GLdouble));
 	auto lengthSize = GLsizeiptr(lengths.size() * sizeof(GLdouble));
 	auto indicesSize = GLsizeiptr(indices.size() * sizeof(GLuint));
 	auto segmentCountsSize = GLsizeiptr(segmentCounts.size() * sizeof(GLuint));
@@ -101,8 +104,8 @@ void Simulation::updateGPUData() {
 	glBufferData(GL_SHADER_STORAGE_BUFFER, posSize, positions.data(), GL_DYNAMIC_DRAW);
 	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
 	glBufferData(GL_SHADER_STORAGE_BUFFER, velSize, velocities.data(), GL_DYNAMIC_DRAW);
-	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.masses);
-	glBufferData(GL_SHADER_STORAGE_BUFFER, massSize, masses.data(), GL_STATIC_DRAW);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.invMasses);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, invMassSize, invMasses.data(), GL_STATIC_DRAW);
 	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.lengths);
 	glBufferData(GL_SHADER_STORAGE_BUFFER, lengthSize, lengths.data(), GL_STATIC_DRAW);
 	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.indices);
@@ -125,33 +128,12 @@ void Simulation::update() {
 	double newPotentialEnergy = 0;
 	double newKineticEnergy = 0;
 
-	auto t1 = system_clock::now();
-
-	/*
-	// OLD with CPUs
-	{
-		#pragma omp parallel for
-		for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) // not ranged based for msvc
-			for (int k = 0; k < substeps; k++)
-				pendula[i]->update(h, gravity);
-			double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
-			double localKineticEnergy = pendula[i]->kineticEnergy();
-			#pragma omp atomic
-			newPotentialEnergy += localPotentialEnergy;
-			#pragma omp atomic
-			newKineticEnergy += localKineticEnergy;
-		}
-	}*/
-
-
-	// NEW with GPUs
-	{
+	if (useGPUAcceleration) {
 		program->bind();
 		glUniform1d(program->uniformLocation("h"), h);
 		glUniform1d(program->uniformLocation("gravity"), gravity);
 		glUniform1ui(program->uniformLocation("substeps"), substeps);
 
-
 		glDispatchCompute(pendula.size(), 1, 1);
 		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
@@ -168,6 +150,7 @@ void Simulation::update() {
 			}
 			glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
 		}
+
 		// Read updated velocities
 		{
 			glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
@@ -181,10 +164,31 @@ void Simulation::update() {
 			}
 			glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
 		}
-	}
 
-	std::cout << duration_cast<microseconds>(system_clock::now() - t1).count() << std::endl;
+		#pragma omp parallel for
+		for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
+			double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
+			double localKineticEnergy = pendula[i]->kineticEnergy();
+			#pragma omp atomic
+			newPotentialEnergy += localPotentialEnergy;
+			#pragma omp atomic
+			newKineticEnergy += localKineticEnergy;
+		}
 
+	} else {
+		// CPU-Multithreading
+		#pragma omp parallel for
+		for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
+			for (int k = 0; k < substeps; k++)
+				pendula[i]->update(h, gravity);
+			double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
+			double localKineticEnergy = pendula[i]->kineticEnergy();
+			#pragma omp atomic
+			newPotentialEnergy += localPotentialEnergy;
+			#pragma omp atomic
+			newKineticEnergy += localKineticEnergy;
+		}
+	}
 
 	potentialEnergy = newPotentialEnergy;
 	kineticEnergy = newKineticEnergy;
@@ -229,3 +233,9 @@ void Simulation::updateEnergy() {
 	}
 }
 
+void Simulation::useGPUChanged(int state) {
+	useGPUAcceleration = state == Qt::Checked;
+	if (useGPUAcceleration)
+		updateGPUData();
+}
+
diff --git a/src/Simulation.h b/src/Simulation.h
index 6222ad7..6af7bbe 100644
--- a/src/Simulation.h
+++ b/src/Simulation.h
@@ -32,11 +32,13 @@ public:
 	void initialize();
 	void updateEnergy();
 signals:
+	void gpuNotSupported(const std::string &message);
 	void layoutChanged();
 	void positionChanged();
 public slots:
 	void clearPendula();
 	void addPendula(const std::vector<Pendulum *> &add);
+	void useGPUChanged(int);
 private slots:
 	void update();
 private:
@@ -50,11 +52,12 @@ private:
 	struct SSBO {
 		GLuint positions;
 		GLuint velocities;
-		GLuint masses;
+		GLuint invMasses;
 		GLuint lengths;
 		GLuint indices;
 		GLuint segmentCounts;
 	} SSBO;
 	QTimer * timer;
 	int updateInterval = 16;
+	bool useGPUAcceleration = false;
 };
\ No newline at end of file