Tools under development to improve cuda experience in C++
Tested only on Windows
#include "cup/cup.cuh"
#include <iostream>
#include <vector>
template<typename T1, typename T2>
__global__ void kernel(cup::vector<T1, T2>& ints) {
int thread_id = threadIdx.x + blockDim.x * blockIdx.x;
if ( thread_id == 0 ) {
cup::where_am_i(); // prints "Where am I: Device"
// per-thread dynamic allocation on device
cup::device_vector<int> vec {};
vec.reserve(blockDim.x);
vec[2] = 430;
ints[2] = vec[2];
// device side operators
for ( int i = 0; i < ints.capacity(); ++i ) {
printf("vector from host [%d]: %d\n", i, ints[i]);
}
// device side operators
for ( int i = 0; i < vec.capacity(); ++i ) {
printf(" device_vector [%d]: %d\n", i, vec[i]);
}
}
}
int main() {
cup::where_am_i(); // prints "Where am I: Host"
// just works! ints are allocated using cudaMallocManaged
auto std_vec = std::vector<int, cup::managed_allocator<int>>();
std_vec.reserve(8);
auto vec { std::make_unique<cup::vector<int>>() };
auto& vec_ref { *vec };
vec_ref.reserve(4);
vec_ref[1] = -1;
auto t = cup::time_it([&vec_ref]() {
// pass the whole datastructure by reference, just works!
kernel<<<1, 4>>>(vec_ref);
CUDA_CHECK(cudaDeviceSynchronize());
});
printf("Kernel took: %7.3fms\n", t);
for ( int i = 0; i < vec_ref.capacity(); ++i ) {
printf("vector [%d]: %d\n", i, vec_ref[i]);
}
}
/*
Output
Where am I: Host
Where am I: Device
vector from host [0]: 0
vector from host [1]: -1
vector from host [2]: 430
vector from host [3]: 0
device_vector [0]: 0
device_vector [1]: 0
device_vector [2]: 430
device_vector [3]: 0
Kernel took: 3.141ms
vector [0]: 0
vector [1]: -1
vector [2]: 430
vector [3]: 0
*/