Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: trying atomics and tree reduction for CUDA reducer kernels #3123

Merged
merged 13 commits into from
Jun 25, 2024
Merged
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import cupy as cp

cuda_kernel = """
extern "C" {
__global__ void awkward_reduce_argmax_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
toptr[thread_id] = -1;
}
}
}

extern "C" {
__global__ void awkward_reduce_argmax_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
extern __shared__ int shared[];

int idx = threadIdx.x;
int thread_id = blockIdx.x * blockDim.x + idx;

if (thread_id < lenparents) {
shared[idx] = thread_id;
} else {
shared[idx] = -1;
}
__syncthreads();

for (int stride = 1; stride < blockDim.x; stride *= 2) {
int index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = shared[idx - stride];
}
if (index != -1 && (shared[idx] == -1 || fromptr[index] > fromptr[shared[idx]])) {
shared[idx] = index;
}
__syncthreads();
}

if (thread_id < lenparents) {
int parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
partial[blockIdx.x * outlength + parent] = shared[idx];
}
}
}
}

extern "C" {
__global__ void awkward_reduce_argmax_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
int max_index = -1;
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
for (int i = 0; i < blocks; ++i) {
int index = partial[i * outlength + thread_id];
if (index != -1 && (max_index == -1 || fromptr[index] > fromptr[max_index])) {
max_index = index;
}
}
toptr[thread_id] = max_index;
}
}
}
"""

parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
lenparents = len(parents)
outlength = int(cp.max(parents)) + 1
toptr = cp.full(outlength, -1, dtype=cp.int32)


block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
for i in range (len(block_size)):
partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32)
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
shared_mem_size = block_size[i] * cp.int32().nbytes

raw_module = cp.RawModule(code=cuda_kernel)

awkward_reduce_argmax_a = raw_module.get_function('awkward_reduce_argmax_a')
awkward_reduce_argmax_b = raw_module.get_function('awkward_reduce_argmax_b')
awkward_reduce_argmax_c = raw_module.get_function('awkward_reduce_argmax_c')

awkward_reduce_argmax_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
awkward_reduce_argmax_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
awkward_reduce_argmax_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))

assert cp.array_equal(toptr, cp.array([0, 2, 8, -1, -1, 9]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import cupy as cp

cuda_kernel = """
extern "C" {
__global__ void awkward_reduce_argmin_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
toptr[thread_id] = -1;
}
}
}

extern "C" {
__global__ void awkward_reduce_argmin_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
extern __shared__ int shared[];

int idx = threadIdx.x;
int thread_id = blockIdx.x * blockDim.x + idx;

if (thread_id < lenparents) {
shared[idx] = thread_id;
} else {
shared[idx] = -1;
}
__syncthreads();

for (int stride = 1; stride < blockDim.x; stride *= 2) {
int index = -1;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
index = shared[idx - stride];
}
if (index != -1 && (shared[idx] == -1 || fromptr[index] < fromptr[shared[idx]])) {
shared[idx] = index;
}
__syncthreads();
}

if (thread_id < lenparents) {
int parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
partial[blockIdx.x * outlength + parent] = shared[idx];
}
}
}
}

extern "C" {
__global__ void awkward_reduce_argmin_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
int min_index = -1;
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
for (int i = 0; i < blocks; ++i) {
int index = partial[i * outlength + thread_id];
if (index != -1 && (min_index == -1 || fromptr[index] < fromptr[min_index])) {
min_index = index;
}
}
toptr[thread_id] = min_index;
}
}
}
"""

parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
fromptr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.int32)
lenparents = len(parents)
outlength = int(cp.max(parents)) + 1
toptr = cp.full(outlength, -1, dtype=cp.int32)


block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
for i in range (len(block_size)):
partial = cp.full((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), -1, dtype=cp.int32)
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
shared_mem_size = block_size[i] * cp.int32().nbytes

raw_module = cp.RawModule(code=cuda_kernel)

awkward_reduce_argmin_a = raw_module.get_function('awkward_reduce_argmin_a')
awkward_reduce_argmin_b = raw_module.get_function('awkward_reduce_argmin_b')
awkward_reduce_argmin_c = raw_module.get_function('awkward_reduce_argmin_c')

awkward_reduce_argmin_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
awkward_reduce_argmin_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
awkward_reduce_argmin_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))

assert cp.array_equal(toptr, cp.array([0, 1, 3, -1, -1, 9]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import cupy as cp

cuda_kernel = """
extern "C" {
__global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
toptr[thread_id] = 0;
}
}
}

extern "C" {
__global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
extern __shared__ int shared[];

int idx = threadIdx.x;
int thread_id = blockIdx.x * blockDim.x + idx;

if (thread_id < lenparents) {
shared[idx] = 1;
} else {
shared[idx] = 0;
}
__syncthreads();

for (int stride = 1; stride < blockDim.x; stride *= 2) {
int val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = shared[idx - stride];
}
shared[idx] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
partial[blockIdx.x * outlength + parent] = shared[idx];
}
}
}
}

extern "C" {
__global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
int countnonzero = 0;
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
for (int i = 0; i < blocks; ++i) {
countnonzero += partial[i * outlength + thread_id];
}
toptr[thread_id] = countnonzero;
}
}
}
"""

parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32)
lenparents = len(parents)
outlength = int(cp.max(parents)) + 1
toptr = cp.zeros(outlength, dtype=cp.int32)

block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
for i in range (len(block_size)):
partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
shared_mem_size = block_size[i] * cp.int32().nbytes

raw_module = cp.RawModule(code=cuda_kernel)

awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')

awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))

assert cp.array_equal(toptr, cp.array([1, 2, 6, 0, 0, 1]))
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import cupy as cp

cuda_kernel = """
extern "C" {
__global__ void awkward_reduce_countnonzero_a(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
toptr[thread_id] = 0;
}
}
}

extern "C" {
__global__ void awkward_reduce_countnonzero_b(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
extern __shared__ int shared[];

int idx = threadIdx.x;
int thread_id = blockIdx.x * blockDim.x + idx;

if (thread_id < lenparents) {
shared[idx] = (fromptr[thread_id] != 0) ? 1 : 0;
} else {
shared[idx] = 0;
}
__syncthreads();

for (int stride = 1; stride < blockDim.x; stride *= 2) {
int val = 0;
if (idx >= stride && thread_id < lenparents && parents[thread_id] == parents[thread_id - stride]) {
val = shared[idx - stride];
}
shared[idx] += val;
__syncthreads();
}

if (thread_id < lenparents) {
int parent = parents[thread_id];
if (idx == blockDim.x - 1 || thread_id == lenparents - 1 || parents[thread_id] != parents[thread_id + 1]) {
partial[blockIdx.x * outlength + parent] = shared[idx];
}
}
}
}

extern "C" {
__global__ void awkward_reduce_countnonzero_c(int* toptr, int* fromptr, int* parents, int lenparents, int outlength, int* partial) {
int thread_id = blockIdx.x * blockDim.x + threadIdx.x;

if (thread_id < outlength) {
int countnonzero = 0;
int blocks = (lenparents + blockDim.x - 1) / blockDim.x;
for (int i = 0; i < blocks; ++i) {
countnonzero += partial[i * outlength + thread_id];
}
toptr[thread_id] = countnonzero;
}
}
}
"""

parents = cp.array([0, 1, 1, 2, 2, 2, 2, 2, 2, 5], dtype=cp.int32)
fromptr = cp.array([1, 2, 3, 0, 5, 6, 0, 8, 9, 0], dtype=cp.int32)
lenparents = len(parents)
outlength = int(cp.max(parents)) + 1
toptr = cp.zeros(outlength, dtype=cp.int32)

block_size = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
for i in range (len(block_size)):
partial = cp.zeros((outlength * ((lenparents + block_size[i] - 1) // block_size[i])), dtype=cp.int32)
grid_size = (lenparents + block_size[i] - 1) // block_size[i]
shared_mem_size = block_size[i] * cp.int32().nbytes

raw_module = cp.RawModule(code=cuda_kernel)

awkward_reduce_countnonzero_a = raw_module.get_function('awkward_reduce_countnonzero_a')
awkward_reduce_countnonzero_b = raw_module.get_function('awkward_reduce_countnonzero_b')
awkward_reduce_countnonzero_c = raw_module.get_function('awkward_reduce_countnonzero_c')

awkward_reduce_countnonzero_a((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))
awkward_reduce_countnonzero_b((grid_size,), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial), shared_mem=shared_mem_size)
awkward_reduce_countnonzero_c(((outlength + block_size[i] - 1) // block_size[i],), (block_size[i],), (toptr, fromptr, parents, lenparents, outlength, partial))

assert cp.array_equal(toptr, cp.array([1, 2, 4, 0, 0, 0]))
Loading
Loading