parallel101 · Uuq114 · Feb 25, 2022
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 build
 GNUmakefile
+bench.txt
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,3 +7,4 @@ if (NOT CMAKE_BUILD_TYPE)
 endif()
 
 add_executable(main main.cpp)
+target_compile_options(main PUBLIC -ffast-math -march=native)
diff --git a/main.cpp b/main.cpp
@@ -1,73 +1,121 @@
+#include <chrono>
+#include <cmath>
 #include <cstdio>
 #include <cstdlib>
+#include <tuple>
 #include <vector>
-#include <chrono>
-#include <cmath>
 
-float frand() {
-    return (float)rand() / RAND_MAX * 2 - 1;
+#define VECTOR_SIZE 48
+
+/* static float frand() {
+    float INV_RAND_MAX = 1.0 / RAND_MAX;
+    return (float)rand() * (INV_RAND_MAX * 2) - 1;
+} */
+static std::tuple<float, float, float, float, float, float, float> frand7() {
+    float INV_RAND_MAX = 1.0 / RAND_MAX;
+    float a = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    float b = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    float c = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    float d = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    float e = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    float f = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    float g = (float)rand() * (INV_RAND_MAX * 2) - 1;
+    return std::make_tuple(a, b, c, d, e, f, g);
 }
 
-struct Star {
-    float px, py, pz;
-    float vx, vy, vz;
-    float mass;
+struct alignas(32) Star {
+    // float px, py, pz;
+    // float vx, vy, vz;
+    // float mass;
+    float px[VECTOR_SIZE];
+    float py[VECTOR_SIZE];
+    float pz[VECTOR_SIZE];
+    float vx[VECTOR_SIZE];
+    float vy[VECTOR_SIZE];
+    float vz[VECTOR_SIZE];
+    float mass[VECTOR_SIZE];
 };
 
-std::vector<Star> stars;
+// std::vector<Star> stars;
+Star stars;
 
-void init() {
-    for (int i = 0; i < 48; i++) {
-        stars.push_back({
-            frand(), frand(), frand(),
-            frand(), frand(), frand(),
-            frand() + 1,
-        });
+static void init() {
+#pragma GCC unroll 8
+    for (size_t i = 0; i < VECTOR_SIZE; i++) {
+        // stars.push_back({
+        //     frand(), frand(), frand(),
+        //     frand(), frand(), frand(),
+        //     frand() + 1,
+        // });
+        // stars.px.emplace_back(frand());
+        // stars.py.emplace_back(frand());
+        // stars.pz.emplace_back(frand());
+        // stars.vx.emplace_back(frand());
+        // stars.vy.emplace_back(frand());
+        // stars.vz.emplace_back(frand());
+        // stars.mass.emplace_back(frand() + 1);
+        auto [a, b, c, d, e, f, g] = frand7();
+        stars.px[i] = a;
+        stars.py[i] = b;
+        stars.pz[i] = c;
+        stars.vx[i] = d;
+        stars.vy[i] = e;
+        stars.vz[i] = f;
+        stars.mass[i] = g + 1;
     }
 }
 
-float G = 0.001;
-float eps = 0.001;
-float dt = 0.01;
+constexpr float G = 0.001;
+constexpr float eps = 0.001;
+constexpr float dt = 0.01;
+constexpr float eps_times_eps = eps * eps;  // eps * eps
 
-void step() {
-    for (auto &star: stars) {
-        for (auto &other: stars) {
-            float dx = other.px - star.px;
-            float dy = other.py - star.py;
-            float dz = other.pz - star.pz;
-            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
-            d2 *= sqrt(d2);
-            star.vx += dx * other.mass * G * dt / d2;
-            star.vy += dy * other.mass * G * dt / d2;
-            star.vz += dz * other.mass * G * dt / d2;
+static void step() {
+#pragma GCC ivdep
+    for (int i = 0; i < VECTOR_SIZE; ++i) {
+        for (int j = 0; j < VECTOR_SIZE; ++j) {
+            if (i == j)
+                continue;
+            float dx = stars.px[j] - stars.px[i];
+            float dy = stars.py[j] - stars.py[i];
+            float dz = stars.pz[j] - stars.pz[i];
+            float d2 = dx * dx + dy * dy + dz * dz + eps_times_eps;
+            d2 *= std::sqrt(d2);
+            float inv_d2 = 1.0 / d2;
+            float tmp = stars.mass[j] * (G * dt) * inv_d2;
+            stars.vx[i] += dx * tmp;
+            stars.vy[i] += dy * tmp;
+            stars.vz[i] += dz * tmp;
         }
     }
-    for (auto &star: stars) {
-        star.px += star.vx * dt;
-        star.py += star.vy * dt;
-        star.pz += star.vz * dt;
+#pragma GCC unroll 16
+    for (int i = 0; i < VECTOR_SIZE; ++i) {
+        stars.px[i] += stars.vx[i] * dt;
+        stars.py[i] += stars.vy[i] * dt;
+        stars.pz[i] += stars.vz[i] * dt;
     }
 }
 
-float calc() {
+static float calc() {
     float energy = 0;
-    for (auto &star: stars) {
-        float v2 = star.vx * star.vx + star.vy * star.vy + star.vz * star.vz;
-        energy += star.mass * v2 / 2;
-        for (auto &other: stars) {
-            float dx = other.px - star.px;
-            float dy = other.py - star.py;
-            float dz = other.pz - star.pz;
-            float d2 = dx * dx + dy * dy + dz * dz + eps * eps;
-            energy -= other.mass * star.mass * G / sqrt(d2) / 2;
+#pragma GCC ivdep
+    for (int i = 0; i < VECTOR_SIZE; ++i) {
+        float v2 = stars.vx[i] * stars.vx[i] + stars.vy[i] * stars.vy[i] + stars.vz[i] * stars.vz[i];
+        energy += stars.mass[i] * v2 / 2;
+        for (int j = 0; j < VECTOR_SIZE; ++j) {
+            float dx = stars.px[j] - stars.px[i];
+            float dy = stars.py[j] - stars.py[i];
+            float dz = stars.pz[j] - stars.pz[i];
+            float d2 = dx * dx + dy * dy + dz * dz + eps_times_eps;
+            float inv_sqrt_d2 = 1 / std::sqrt(d2);
+            energy -= stars.mass[j] * stars.mass[i] * (G * 0.5f) * inv_sqrt_d2;
         }
     }
     return energy;
 }
 
 template <class Func>
-long benchmark(Func const &func) {
+static long benchmark(Func const& func) {
     auto t0 = std::chrono::steady_clock::now();
     func();
     auto t1 = std::chrono::steady_clock::now();
@@ -79,7 +127,7 @@ int main() {
     init();
     printf("Initial energy: %f\n", calc());
     auto dt = benchmark([&] {
-        for (int i = 0; i < 100000; i++)
+        for (size_t i = 0; i < 100000; i++)
             step();
     });
     printf("Final energy: %f\n", calc());