gpu acceleration more efficient

added gpu simulation
optimize includes
9 changed files with 283 additions and 29 deletions
--- a/shaders/compute.glsl
+++ b/shaders/compute.glsl
@ -0,0 +1,86 @@
+#version 430
+#extension GL_ARB_gpu_shader_fp64 : enable
+
+#define MAX_SEGMENTS 32
+
+layout (local_size_x=MAX_SEGMENTS, local_size_y=1, local_size_z=1) in;
+
+layout(std430, binding = 0) buffer PositionBuffer {
+    dvec2 X[];
+};
+
+layout(std430, binding = 1) buffer VelocityBuffer {
+    dvec2 V[];
+};
+
+layout(std430, binding = 2) buffer InverseMassBuffer {
+    double W[];
+};
+
+layout(std430, binding = 3) buffer LengthBuffer {
+    double L[];
+};
+
+layout(std430, binding = 4) buffer IndexBuffer {
+    uint indices[];
+};
+
+layout(std430, binding = 5) buffer SegmentCountBuffer {
+    uint segmentCounts[];
+};
+
+uniform double h;
+uniform double gravity;
+uniform uint substeps;
+
+shared dvec2 P[MAX_SEGMENTS];
+
+// distance constraint
+void solveConstraint(uint segmentId, uint gIndex){
+    double w1 = segmentId == 0 ? 0.0 : W[gIndex - 1];
+    double w2 = W[gIndex];
+
+    dvec2 s = (segmentId == 0 ? dvec2(0, 0) : P[segmentId - 1]) - P[segmentId];
+    dvec2 n = normalize(s);
+    double l = length(s);
+
+    dvec2 deltaP1 = n * -w1 / (w1 + w2) * (l - L[gIndex]);
+    dvec2 deltaP2 = n *  w2 / (w1 + w2) * (l - L[gIndex]);
+
+    if (segmentId > 0)
+        P[segmentId - 1] += deltaP1;
+    P[segmentId] += deltaP2;
+}
+
+// No need for synchronization, because each pendulum gets its own warp
+void main() {
+    uint pendulumId = gl_WorkGroupID.x;
+    uint i_off = indices[pendulumId];
+    uint segmentCount = segmentCounts[pendulumId];
+    uint localId = gl_LocalInvocationID.x;
+    uint gIndex = i_off + localId;
+
+    // discard unneeded local threads for segments
+    if (localId >= segmentCount)
+        return;
+
+    // substepping is not parallelizable
+    for (int k = 0; k < substeps; k++){
+
+        // explicit integration with gravity as external force
+        V[gIndex] = V[gIndex] + dvec2(0, -gravity * h);
+        P[localId] = X[gIndex] + V[gIndex] * h;
+
+        // every segment depends on its neighbour, so we can not parallelize all constraints
+        // only parallelize those, who are independent of each other, skipping every second segment
+        if (localId % 2 == 0){
+            solveConstraint(localId, i_off + localId);
+            if (localId + 1 < segmentCount)
+                solveConstraint(localId + 1, gIndex + 1);
+        }
+
+        // update position and velocity
+        V[gIndex] = (P[localId] - X[gIndex]) / h;
+        X[gIndex] = P[localId];
+    }
+}
--- a/shaders/shaders.qrc
+++ b/shaders/shaders.qrc
@ -4,5 +4,6 @@
        <file>fragment_pendula.glsl</file>
        <file>vertex_quad.glsl</file>
        <file>fragment_quad.glsl</file>
+        <file>compute.glsl</file>
    </qresource>
 </RCC>
--- a/src/GLWidget.cpp
+++ b/src/GLWidget.cpp
@ -9,6 +9,7 @@
 #include "Overlay.h"
 #include <QOpenGLShaderProgram>
 #include "FPS.h"
+#include "Pendulum.h"

 GLWidget::GLWidget(Simulation * simulation) : simulation(simulation) {
 	startTimer(1000 / 144);
@ -21,7 +22,7 @@ GLWidget::GLWidget(Simulation * simulation) : simulation(simulation) {
 void GLWidget::initializeGL() {
 	initializeOpenGLFunctions();

-	std::cout << "OpenGL Version: " << format().majorVersion() << "." << format().minorVersion() << std::endl;
+	printf("OpenGL Version: %d.%d\n", format().majorVersion(), format().minorVersion());

 	pendulaProgram = new QOpenGLShaderProgram;
 	pendulaProgram->addShaderFromSourceFile(QOpenGLShader::Vertex, ":/shaders/vertex_pendula.glsl");
@ -186,6 +187,7 @@ void GLWidget::uploadStaticDataToGPU() {
 }

 void GLWidget::changePosition() {
+	auto t1 = system_clock::now();
 	glBindBuffer(GL_ARRAY_BUFFER, positionVBO);
 	auto positions = (GLfloat*) glMapBufferRange(GL_ARRAY_BUFFER, 0, GLsizeiptr(positionCount * sizeof(float)), GL_MAP_WRITE_BIT);
 	size_t index = 0;
@ -198,6 +200,7 @@ void GLWidget::changePosition() {
 		}
 	}
 	glUnmapBuffer(GL_ARRAY_BUFFER);
+	// std::cout << duration_cast<microseconds>(system_clock::now() - t1).count() << std::endl;
 	simulation->semaphore.release();
 }

--- a/src/MainWindow.cpp
+++ b/src/MainWindow.cpp
@ -17,13 +17,14 @@
 #include <QCloseEvent>
 #include <QTimer>
 #include "Overlay.h"
+#include "Slider.h"

 MainWindow::MainWindow() {
 	simulationThread = new QThread(this);
+	simulationThread->setObjectName("Simulation");
 	simulation = new Simulation;
 	simulation->moveToThread(simulationThread);
 	simulationThread->start();
-	QMetaObject::invokeMethod(simulation, &Simulation::initialize);

 	masses = std::vector<double>(MaxSegments);
 	lengths = std::vector<double>(MaxSegments);
@ -34,6 +35,8 @@ MainWindow::MainWindow() {
 	buildUI();

 	normalizeLengths();
+
+	QMetaObject::invokeMethod(simulation, &Simulation::initialize);
 }

 void MainWindow::buildUI() {
@ -377,6 +380,20 @@ QWidget * MainWindow::buildSimulationUI() {
 		lyt->addWidget(showMasses);
 	}

+	// GPU-Acceleration
+	{
+		auto useGPU = new QCheckBox("Use GPU-Acceleration");
+		connect(useGPU, &QCheckBox::stateChanged, simulation, &Simulation::useGPUChanged);
+		connect(simulation, &Simulation::gpuNotSupported, [useGPU](const std::string &message){
+			useGPU->setCheckState(Qt::Unchecked);
+			useGPU->setEnabled(false);
+			useGPU->setToolTip(QString::fromStdString(message));
+		});
+		useGPU->setCheckState(Qt::Unchecked);
+
+		lyt->addWidget(useGPU);
+	}
+
 	return w;
 }

--- a/src/MainWindow.h
+++ b/src/MainWindow.h
@ -1,11 +1,12 @@
 #include <QWidget>
 #include <vector>
-#include "Slider.h"
 #include "Pendulum.h"

 class Simulation;
 class QGridLayout;
 class GLWidget;
+template<typename T>
+		class Slider;

 enum Property {Angle, Mass, Length};

@ -23,7 +24,7 @@ private:

 	GLWidget * glWidget = nullptr;

-	static const int MaxSegments = 50;
+	static const int MaxSegments = 32;

 	std::vector<double> masses, lengths;
 	int segments = 0;
@ -38,7 +39,7 @@ private:

 	QGridLayout * segmentGrid {};
 	Slider<double> * gravitySlider = nullptr, * timescaleSlider = nullptr;
-	Slider<> * substepsSlider = nullptr;
+	Slider<int> * substepsSlider = nullptr;

 signals:
 	void pendulaCreated(const std::vector<Pendulum*> &add);
--- a/src/Pendulum.h
+++ b/src/Pendulum.h
@ -4,7 +4,8 @@
 #include <QVector2D>
 #include <QColor>
 #include "Vector.h"
-#include "GLWidget.h"
+
+class GLWidget;

 class Pendulum {
 public:
@ -16,6 +17,7 @@ public:
 	double kineticEnergy() const;
 private:
 	friend class GLWidget;
+	friend class Simulation;
 	std::vector<Vector> X, V;
 	std::vector<double> M, L;
 	QColor color;
--- a/src/Simulation.cpp
+++ b/src/Simulation.cpp
@ -5,6 +5,7 @@
 #include <omp.h>
 #include <iostream>
 #include "FPS.h"
+#include "Pendulum.h"
 #include <QOpenGLContext>
 #include <QOpenGLShaderProgram>
 #include <QOffscreenSurface>
@ -12,7 +13,6 @@
 Simulation::Simulation() {
 	ups = new FPS;
 	ups->setUpdateInterval(100);
-
 	timer = new QTimer(this);
 	QTimer::connect(timer, &QTimer::timeout, this, &Simulation::update);
 	timer->setInterval(updateInterval);
@ -31,24 +31,90 @@ void Simulation::initialize() {
 		surface->create();
 		context->makeCurrent(surface);
 		initializeOpenGLFunctions();
+		if (!context->hasExtension("GL_ARB_gpu_shader_fp64")){
+			std::string message = "Double precision not supported by OpenGL! Falling back to CPU-Multithreading.";
+			printf("%s\n", message.c_str());
+			useGPUAcceleration = false;
+			emit gpuNotSupported(message);
+		}
 	}

-	int maxWorkGroupCount[3];
-	int maxWorkGroupSize[3];
-	int maxWorkGroupInvocations;
-	for (int i = 0; i < 3; i++){
-		glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, i, maxWorkGroupCount + i);
-		glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, i, maxWorkGroupSize + i);
+	// Read GPU Limits
+	{
+		for (int i = 0; i < 3; i++){
+			glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_COUNT, i, gpuLimits.maxWGCount + i);
+			glGetIntegeri_v(GL_MAX_COMPUTE_WORK_GROUP_SIZE, i, gpuLimits.maxWGSize + i);
+		}
+		glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &gpuLimits.maxWGInvocations);
+
+		printf("Max work group count: (%d, %d, %d)\n", gpuLimits.maxWGCount[0], gpuLimits.maxWGCount[1], gpuLimits.maxWGCount[2]);
+		printf("Max work group size: (%d, %d, %d)\n", gpuLimits.maxWGSize[0], gpuLimits.maxWGSize[1], gpuLimits.maxWGSize[2]);
+		printf("Max work group invocations (x * y * z): %d\n", gpuLimits.maxWGInvocations);
 	}
-	glGetIntegerv(GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS, &maxWorkGroupInvocations);

+	// Shader
+	{
+		program = new QOpenGLShaderProgram;
+		program->addShaderFromSourceFile(QOpenGLShader::Compute, ":/shaders/compute.glsl");
+		program->link();
+
+		glGenBuffers(6, &SSBO.positions);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, SSBO.positions);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, SSBO.velocities);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, SSBO.invMasses);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, SSBO.lengths);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, SSBO.indices);
+		glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, SSBO.segmentCounts);
+	}

-	printf("Max work group count: (%d, %d, %d)\n", maxWorkGroupCount[0], maxWorkGroupCount[1], maxWorkGroupCount[2]);
-	printf("Max work group size: (%d, %d, %d)\n", maxWorkGroupSize[0], maxWorkGroupSize[1], maxWorkGroupSize[2]);
-	printf("Max work group invocations (x * y * z): %d\n", maxWorkGroupInvocations);
+}

+// When the layout changes
+void Simulation::updateGPUData() {
+	std::vector<GLdouble> positions;
+	std::vector<GLdouble> velocities;
+	std::vector<GLdouble> invMasses;
+	std::vector<GLdouble> lengths;
+	std::vector<GLuint> indices;
+	std::vector<GLuint> segmentCounts;
+
+	GLuint currentIndex = 0;
+	for (const Pendulum * p : pendula){
+		indices.push_back(currentIndex);
+		currentIndex += p->X.size();
+		segmentCounts.push_back(p->X.size());
+		for (size_t i = 0; i < p->X.size(); i++){
+			positions.push_back(p->X[i].x);
+			positions.push_back(p->X[i].y);
+			velocities.push_back(p->V[i].x);
+			velocities.push_back(p->V[i].y);
+			invMasses.push_back(1.0 / p->M[i]);
+			lengths.push_back(p->L[i]);
+		}
+	}
+
+	auto posSize = GLsizeiptr(positions.size() * sizeof(GLdouble));
+	auto velSize = GLsizeiptr(velocities.size() * sizeof(GLdouble));
+	auto invMassSize = GLsizeiptr(invMasses.size() * sizeof(GLdouble));
+	auto lengthSize = GLsizeiptr(lengths.size() * sizeof(GLdouble));
+	auto indicesSize = GLsizeiptr(indices.size() * sizeof(GLuint));
+	auto segmentCountsSize = GLsizeiptr(segmentCounts.size() * sizeof(GLuint));
+
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.positions);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, posSize, positions.data(), GL_DYNAMIC_DRAW);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, velSize, velocities.data(), GL_DYNAMIC_DRAW);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.invMasses);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, invMassSize, invMasses.data(), GL_STATIC_DRAW);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.lengths);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, lengthSize, lengths.data(), GL_STATIC_DRAW);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.indices);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, indicesSize, indices.data(), GL_STATIC_DRAW);
+	glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.segmentCounts);
+	glBufferData(GL_SHADER_STORAGE_BUFFER, segmentCountsSize, segmentCounts.data(), GL_STATIC_DRAW);
 }

+// Every few milliseconds
 void Simulation::update() {
 	if (!isPlaying)
 		return;
@ -62,16 +128,66 @@ void Simulation::update() {
 	double newPotentialEnergy = 0;
 	double newKineticEnergy = 0;

-	#pragma omp parallel for
-	for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) // not ranged based for msvc
-		for (int k = 0; k < substeps; k++)
-			pendula[i]->update(h, gravity);
-		double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
-		double localKineticEnergy = pendula[i]->kineticEnergy();
-		#pragma omp atomic
-		newPotentialEnergy += localPotentialEnergy;
-		#pragma omp atomic
-		newKineticEnergy += localKineticEnergy;
+	if (useGPUAcceleration) {
+		program->bind();
+		glUniform1d(program->uniformLocation("h"), h);
+		glUniform1d(program->uniformLocation("gravity"), gravity);
+		glUniform1ui(program->uniformLocation("substeps"), substeps);
+
+		glDispatchCompute(pendula.size(), 1, 1);
+		glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
+
+		// Read updated positions
+		{
+			glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.positions);
+			auto * newPositions = (double*) glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
+			int index = 0;
+			for (Pendulum * p : pendula){
+				for (Vector &point : p->X){
+					point.x = newPositions[index++];
+					point.y = newPositions[index++];
+				}
+			}
+			glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+		}
+
+		// Read updated velocities
+		{
+			glBindBuffer(GL_SHADER_STORAGE_BUFFER, SSBO.velocities);
+			auto * newVelocities = (double*) glMapBuffer(GL_SHADER_STORAGE_BUFFER, GL_READ_ONLY);
+			int index = 0;
+			for (Pendulum * p : pendula){
+				for (Vector &velocity : p->V){
+					velocity.x = newVelocities[index++];
+					velocity.y = newVelocities[index++];
+				}
+			}
+			glUnmapBuffer(GL_SHADER_STORAGE_BUFFER);
+		}
+
+		#pragma omp parallel for
+		for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
+			double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
+			double localKineticEnergy = pendula[i]->kineticEnergy();
+			#pragma omp atomic
+			newPotentialEnergy += localPotentialEnergy;
+			#pragma omp atomic
+			newKineticEnergy += localKineticEnergy;
+		}
+
+	} else {
+		// CPU-Multithreading
+		#pragma omp parallel for
+		for (int i = 0; i < pendula.size(); i++){ // NOLINT(*-loop-convert) not ranged based for msvc
+			for (int k = 0; k < substeps; k++)
+				pendula[i]->update(h, gravity);
+			double localPotentialEnergy = pendula[i]->potentialEnergy(gravity);
+			double localKineticEnergy = pendula[i]->kineticEnergy();
+			#pragma omp atomic
+			newPotentialEnergy += localPotentialEnergy;
+			#pragma omp atomic
+			newKineticEnergy += localKineticEnergy;
+		}
 	}

 	potentialEnergy = newPotentialEnergy;
@ -92,6 +208,7 @@ void Simulation::clearPendula() {
 	pendula.shrink_to_fit();

 	updateEnergy();
+	updateGPUData();

 	emit layoutChanged();
 }
@ -103,6 +220,7 @@ void Simulation::addPendula(const std::vector<Pendulum *> &add) {
 		pendula.push_back(p);

 	updateEnergy();
+	updateGPUData();

 	emit layoutChanged();
 }
@ -115,3 +233,9 @@ void Simulation::updateEnergy() {
 	}
 }

+void Simulation::useGPUChanged(int state) {
+	useGPUAcceleration = state == Qt::Checked;
+	if (useGPUAcceleration)
+		updateGPUData();
+}
+
--- a/src/Simulation.h
+++ b/src/Simulation.h
@ -2,12 +2,12 @@
 #include <cstdint>
 #include <QObject>
 #include <chrono>
-#include "Pendulum.h"
 #include <semaphore>
 #include <QOpenGLFunctions_4_3_Core>

-class QTimer;
 class FPS;
+class Pendulum;
+class QOpenGLShaderProgram;

 class Simulation : public QObject, protected QOpenGLFunctions_4_3_Core {
 	Q_OBJECT
@ -32,14 +32,32 @@ public:
 	void initialize();
 	void updateEnergy();
 signals:
+	void gpuNotSupported(const std::string &message);
 	void layoutChanged();
 	void positionChanged();
 public slots:
 	void clearPendula();
 	void addPendula(const std::vector<Pendulum *> &add);
+	void useGPUChanged(int);
 private slots:
 	void update();
 private:
+	void updateGPUData();
+	struct GPULimits {
+		int maxWGCount[3];
+		int maxWGSize[3];
+		int maxWGInvocations;
+	} gpuLimits;
+	QOpenGLShaderProgram * program;
+	struct SSBO {
+		GLuint positions;
+		GLuint velocities;
+		GLuint invMasses;
+		GLuint lengths;
+		GLuint indices;
+		GLuint segmentCounts;
+	} SSBO;
 	QTimer * timer;
 	int updateInterval = 16;
+	bool useGPUAcceleration = false;
 };
--- a/src/Slider.h
+++ b/src/Slider.h
@ -1,3 +1,5 @@
+#pragma once
+
 #include <QSlider>
 #include <functional>
 #include <QLabel>
Author	SHA1	Message	Date
Benjamin Kraft	05baa0db8a	gpu acceleration more efficient	1 year ago
Benjamin Kraft	08cabd9d8e	added gpu simulation	1 year ago
Benjamin Kraft	df169be295	optimize includes	1 year ago