-
Notifications
You must be signed in to change notification settings - Fork 1
/
nbody_cuda.cu
171 lines (131 loc) · 4.7 KB
/
nbody_cuda.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cuda.h>
#include <cuda_runtime.h>
#define dT 0.2f
#define G 0.6f
#define BLOCK_SIZE 64
// Global variables
int num_planets;
int num_timesteps;
// Host arrays
float2* velocities;
float4* planets;
// Device arrays
float2* velocities_d;
float4* planets_d;
// Parse command line arguments
void parse_args(int argc, char** argv){
if(argc != 2){
printf("Useage: nbody num_timesteps\n");
exit(-1);
}
num_timesteps = strtol(argv[1], 0, 10);
}
// Reads planets from planets.txt
void read_planets(){
FILE* file = fopen("planets.txt", "r");
if(file == NULL){
printf("'planets.txt' not found. Exiting\n");
exit(-1);
}
char line[200];
fgets(line, 200, file);
sscanf(line, "%d", &num_planets);
planets = (float4*)malloc(sizeof(float4)*num_planets);
velocities = (float2*)malloc(sizeof(float2)*num_planets);
for(int p = 0; p < num_planets; p++){
fgets(line, 200, file);
sscanf(line, "%f %f %f %f %f",
&planets[p].x,
&planets[p].y,
&velocities[p].x,
&velocities[p].y,
&planets[p].z);
}
fclose(file);
}
// Writes planets to file
void write_planets(int timestep){
char name[20];
int n = sprintf(name, "planets_out.txt");
FILE* file = fopen(name, "wr+");
for(int p = 0; p < num_planets; p++){
fprintf(file, "%f %f %f %f %f\n",
planets[p].x,
planets[p].y,
velocities[p].x,
velocities[p].y,
planets[p].z);
}
fclose(file);
}
// TODO 7. Calculate the change in velocity for p, caused by the interaction with q
__device__ float2 calculate_velocity_change_planet(float4 p, float4 q){
float2 r;
r.x = q.x - p.x;
r.y = q.y - p.y;
if(r.x == 0 && r.y == 0) {
float2 v = {0.0f, 0.0f};
return v;
}
float abs_dist = sqrt(r.x*r.x + r.y*r.y);
float dist_cubed = abs_dist*abs_dist*abs_dist;
float2 dv;
dv.x = dT*G*q.z/dist_cubed * r.x;
dv.y = dT*G*q.z/dist_cubed * r.y;
return dv;
}
// TODO 5. Calculate the change in velocity for my_planet, caused by the interactions with a block of planets
__device__ float2 calculate_velocity_change_block(float4 my_planet, float4* shared_planets){
float2 velocity = {0.0f, 0.0f};
for(int i = 0; i < blockDim.x; i++) {
float2 tempv = calculate_velocity_change_planet(my_planet, shared_planets[i]);
velocity.x += tempv.x;
velocity.y += tempv.y;
}
return velocity;
}
// TODO 4. Update the velocities by calculating the planet interactions
__global__ void update_velocities(float4* planets, float2* velocities, int num_planets){
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
float4 my_planet = planets[thread_id];
__shared__ float4 shared_planets[BLOCK_SIZE];
for(int i = 0; i < num_planets; i += blockDim.x) {
shared_planets[threadIdx.x] = planets[i + threadIdx.x];
__syncthreads();
float2 tempv = calculate_velocity_change_block(my_planet, shared_planets);
velocities[thread_id].x += tempv.x;
velocities[thread_id].y += tempv.y;
__syncthreads();
}
}
// TODO 7. Update the positions of the planets using the new velocities
__global__ void update_positions(float4* planets, float2* velocities, int num_planets){
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
planets[thread_id].x += velocities[thread_id].x * dT;
planets[thread_id].y += velocities[thread_id].y * dT;
}
int main(int argc, char** argv){
parse_args(argc, argv);
read_planets();
// TODO 1. Allocate device memory, and transfer data to device
cudaMalloc(&planets_d, sizeof(float4)*num_planets);
cudaMalloc(&velocities_d, sizeof(float2)*num_planets);
cudaMemcpy(planets_d, planets, sizeof(float4)*num_planets, cudaMemcpyHostToDevice);
cudaMemcpy(velocities_d, velocities, sizeof(float2)*num_planets, cudaMemcpyHostToDevice);
// Calculating the number of blocks
int num_blocks = num_planets/BLOCK_SIZE + ((num_planets%BLOCK_SIZE == 0) ? 0 : 1);
// Main loop
for(int t = 0; t < num_timesteps; t++){
// TODO 2. Call kernels
update_velocities<<<num_blocks, BLOCK_SIZE>>>(planets_d, velocities_d, num_planets);
update_positions<<<num_blocks, BLOCK_SIZE>>>(planets_d, velocities_d, num_planets);
}
// TODO 3. Transfer data back to host
cudaMemcpy(velocities, velocities_d, sizeof(float2)*num_planets, cudaMemcpyDeviceToHost);
cudaMemcpy(planets, planets_d, sizeof(float4)*num_planets, cudaMemcpyDeviceToHost);
// Output
write_planets(num_timesteps);
}