Skip to content

Commit

Permalink
Allow large sizes for vector, matrix, block, tensor and ndim data interfaces, and use proper MPI datatypes to exchange them.
Browse files Browse the repository at this point in the history
  • Loading branch information
nfurmento committed Oct 15, 2024
1 parent e5a489c commit bdb99f3
Show file tree
Hide file tree
Showing 256 changed files with 2,700 additions and 1,952 deletions.
11 changes: 11 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,17 @@ check:
tags:
- ${NODE}

# MPICH check job: the parallel matrix below yields one run per SCRIPT value,
# i.e. one run for mpich.sh and one run for mpich_struct.sh.
check_mpich:
extends: .check_template
tags:
- starpu
- ubuntu1804
parallel:
matrix:
- SCRIPT: [./contrib/gitlab/mpich.sh, ./contrib/gitlab/mpich_struct.sh]
script:
# Run whichever script the matrix selected for this run.
- ${SCRIPT}

check_simgrid:
extends: .check_template
tags:
Expand Down
2 changes: 2 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ StarPU 1.5.0
Changes:
* Rename hierarchical tasks in recursive tasks
* Fix asynchronous partitioning with data without home node
* Allow large sizes for vector, matrix, block, tensor and ndim data
interfaces, and use proper MPI datatypes to exchange them.

Small changes:
* Fix build system for StarPU Python interface
Expand Down
6 changes: 6 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,9 @@ if test x$enable_mpi = xmaybe ; then
fi
fi

# Record in enable_mpi_type_vector_c whether MPI_Type_vector_c may be used.
# The value comes from the --enable/--disable-mpi-type-vector-c option and
# defaults to yes when the option is not given.
AC_ARG_ENABLE(mpi-type-vector-c, AC_HELP_STRING([--disable-mpi-type-vector-c], [Disable usage of function MPI_Type_vector_c]),
[enable_mpi_type_vector_c=$enableval], [enable_mpi_type_vector_c=yes])

# in case MPI was explicitly required, but mpicc is not available, this is an error
if test x$enable_mpi = xyes ; then
if test ! -x "$mpicc_path"; then
Expand All @@ -626,6 +629,9 @@ if test x$enable_mpi = xyes ; then
[AC_DEFINE(STARPU_HAVE_MPI_EXT, [1], [<mpi-ext.h> is available])])

AC_CHECK_FUNC([MPI_Comm_create_group], [AC_DEFINE([STARPU_HAVE_MPI_COMM_CREATE_GROUP], [1], [Define to 1 if the function MPI_Comm_create_group is available.])])
# Probe for MPI_Type_vector_c only when its usage has not been disabled;
# STARPU_HAVE_MPI_TYPE_VECTOR_C is defined to 1 if the function is found.
if test x$enable_mpi_type_vector_c = xyes ; then
AC_CHECK_FUNC([MPI_Type_vector_c], [AC_DEFINE([STARPU_HAVE_MPI_TYPE_VECTOR_C], [1], [Define to 1 if the function MPI_Type_vector_c is available.])])
fi
CC=$OLD_CC
fi

Expand Down
21 changes: 21 additions & 0 deletions contrib/gitlab/mpich.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/sh
# StarPU --- Runtime system for heterogeneous multicore architectures.
#
# Copyright (C) 2020-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
#
# StarPU is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# StarPU is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU Lesser General Public License in COPYING.LGPL for more details.
#

# NOTE(review): presumably makes the check script skip the microbenchmarks -- confirm against job-1-check.sh.
export STARPU_MICROBENCHS_DISABLED=1
# NOTE(review): presumably limits the checked directories to mpi -- confirm against job-1-check.sh.
export STARPU_CHECK_DIRS=mpi
# Extra configure options: use the MPICH compiler wrappers and launcher installed in /usr/bin.
export STARPU_USER_CONFIGURE_OPTIONS="--with-mpicc=/usr/bin/mpicc.mpich --with-mpiexec=/usr/bin/mpiexec.mpich --with-mpicxx=/usr/bin/mpicxx.mpich --with-mpifort=/usr/bin/mpifort.mpich"
# Hand over to the generic check script; it inherits the variables exported above.
./contrib/ci.inria.fr/job-1-check.sh
21 changes: 21 additions & 0 deletions contrib/gitlab/mpich_struct.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/sh
# StarPU --- Runtime system for heterogeneous multicore architectures.
#
# Copyright (C) 2020-2024 University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
#
# StarPU is free software; you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation; either version 2.1 of the License, or (at
# your option) any later version.
#
# StarPU is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
#
# See the GNU Lesser General Public License in COPYING.LGPL for more details.
#

# NOTE(review): presumably makes the check script skip the microbenchmarks -- confirm against job-1-check.sh.
export STARPU_MICROBENCHS_DISABLED=1
# NOTE(review): presumably limits the checked directories to mpi -- confirm against job-1-check.sh.
export STARPU_CHECK_DIRS=mpi
# Extra configure options: use the MPICH compiler wrappers and launcher installed in /usr/bin,
# and pass --disable-mpi-type-vector-c so that usage of MPI_Type_vector_c is disabled.
export STARPU_USER_CONFIGURE_OPTIONS="--with-mpicc=/usr/bin/mpicc.mpich --with-mpiexec=/usr/bin/mpiexec.mpich --with-mpicxx=/usr/bin/mpicxx.mpich --with-mpifort=/usr/bin/mpifort.mpich --disable-mpi-type-vector-c"
# Hand over to the generic check script; it inherits the variables exported above.
./contrib/ci.inria.fr/job-1-check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ void vector_scal_cpu(void *buffers[], void *cl_arg)
//! [Extract To be included. You should update doxygen if you see this text.]
struct starpu_vector_interface *vector = buffers[0];
float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
unsigned n = STARPU_VECTOR_GET_NX(vector);
size_t n = STARPU_VECTOR_GET_NX(vector);
//! [Extract To be included. You should update doxygen if you see this text.]

//! [Unpack To be included. You should update doxygen if you see this text.]
Expand All @@ -33,7 +33,7 @@ void vector_scal_cpu(void *buffers[], void *cl_arg)
//! [Unpack To be included. You should update doxygen if you see this text.]

//! [Compute To be included. You should update doxygen if you see this text.]
unsigned i;
size_t i;
for (i = 0; i < n; i++)
val[i] *= factor;
//! [Compute To be included. You should update doxygen if you see this text.]
Expand Down
4 changes: 2 additions & 2 deletions doc/doxygen/chapters/starpu_basics/basic_examples.doxy
Original file line number Diff line number Diff line change
Expand Up @@ -349,11 +349,11 @@ The definition of the codelet can be written as follows:
\code{.c}
void scal_cpu_func(void *buffers[], void *cl_arg)
{
unsigned i;
size_t i;
float *factor = cl_arg;

/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
/* CPU copy of the vector pointer */
float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);

Expand Down
12 changes: 6 additions & 6 deletions doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
/* This kernel takes a buffer and scales it by a constant factor */
void scal_cpu_func(void *buffers[], void *cl_arg)
{
unsigned i;
size_t i;
float *factor = cl_arg;

/*
Expand All @@ -38,7 +38,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
struct starpu_vector_interface *vector = buffers[0];

/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(vector);
size_t n = STARPU_VECTOR_GET_NX(vector);

/* get a pointer to the local copy of the vector: note that we have to
* cast it in (float *) since a vector could contain any type of
Expand All @@ -53,22 +53,22 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
void scal_sse_func(void *buffers[], void *cl_arg)
{
float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
unsigned int n_iterations = n/4;
size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
size_t n_iterations = n/4;

__m128 *VECTOR = (__m128*) vector;
__m128 FACTOR STARPU_ATTRIBUTE_ALIGNED(16);
float factor = *(float *) cl_arg;
FACTOR = _mm_set1_ps(factor);

unsigned int i;
size_t i;
for (i = 0; i < n_iterations; i++)
VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);

unsigned int remainder = n%4;
if (remainder != 0)
{
unsigned int start = 4 * n_iterations;
size_t start = 4 * n_iterations;
for (i = start; i < start+remainder; ++i)
{
vector[i] = factor * vector[i];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
//! [To be included. You should update doxygen if you see this text.]
#include <starpu.h>

static __global__ void vector_mult_cuda(unsigned n, float *val, float factor)
static __global__ void vector_mult_cuda(size_t n, float *val, float factor)
{
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
Expand All @@ -28,7 +28,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
float *factor = (float *)_args;

/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
/* local copy of the vector pointer */
float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned threads_per_block = 64;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void scal_opencl_func(void *buffers[], void *_args)
cl_event event; /* OpenCL specific code */

/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
/* OpenCL copy of the vector pointer */
cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);

Expand Down
10 changes: 5 additions & 5 deletions doc/doxygen/chapters/starpu_extensions/advanced_tasks.doxy
Original file line number Diff line number Diff line change
Expand Up @@ -140,16 +140,16 @@ to use SSE to scale a vector. The codelet can be written as follows:
void scal_sse_func(void *buffers[], void *cl_arg)
{
float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
unsigned int n_iterations = n/4;
size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
size_t n_iterations = n/4;
if (n % 4 != 0)
n_iterations++;

__m128 *VECTOR = (__m128*) vector;
__m128 factor __attribute__((aligned(16)));
factor = _mm_set1_ps(*(float *) cl_arg);

unsigned int i;
size_t i;
for (i = 0; i < n_iterations; i++)
VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
}
Expand Down Expand Up @@ -352,10 +352,10 @@ the rank of the current CPU within the combined worker. For instance:
\code{.c}
static void func(void *buffers[], void *args)
{
unsigned i;
size_t i;
float *factor = _args;
struct starpu_vector_interface *vector = buffers[0];
unsigned n = STARPU_VECTOR_GET_NX(vector);
size_t n = STARPU_VECTOR_GET_NX(vector);
float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

/* Compute slice to compute */
Expand Down
4 changes: 2 additions & 2 deletions doc/doxygen/chapters/starpu_extensions/code/forkmode.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@
//! [To be included. You should update doxygen if you see this text.]
void scal_cpu_func(void *buffers[], void *_args)
{
unsigned i;
size_t i;
float *factor = _args;
struct starpu_vector_interface *vector = buffers[0];
unsigned n = STARPU_VECTOR_GET_NX(vector);
size_t n = STARPU_VECTOR_GET_NX(vector);
float *val = (float *)STARPU_VECTOR_GET_PTR(vector);

#pragma omp parallel for num_threads(starpu_combined_worker_get_size())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ void fpga_add(void *buffers[], void *cl_arg)
int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);

int size = STARPU_VECTOR_GET_NX(buffers[0]);
size_t size = STARPU_VECTOR_GET_NX(buffers[0]);

/* actions to run on an engine */
max_actions_t *act = max_actions_init(maxfile, NULL);
Expand Down
4 changes: 2 additions & 2 deletions doc/doxygen/chapters/starpu_extensions/recursive_tasks.doxy
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ perform the computation.
void func_cpu(void *descr[], void *_args)
{
(void) _args;
int x;
int nx = STARPU_VECTOR_GET_NX(descr[0]);
size_t x;
size_t nx = STARPU_VECTOR_GET_NX(descr[0]);
TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);

for(x=0 ; x<nx ; x++)
Expand Down
4 changes: 2 additions & 2 deletions doc/tutorial/vector_scal_cpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
/* This kernel takes a buffer and scales it by a constant factor */
void vector_scal_cpu(void *buffers[], void *cl_arg)
{
unsigned i;
size_t i;
float factor;

/*
Expand All @@ -35,7 +35,7 @@ void vector_scal_cpu(void *buffers[], void *cl_arg)
struct starpu_vector_interface *vector = buffers[0];

/* length of the vector */
unsigned n = STARPU_VECTOR_GET_NX(vector);
size_t n = STARPU_VECTOR_GET_NX(vector);

/* get a pointer to the local copy of the vector : note that we have to
* cast it in (float *) since a vector could contain any type of
Expand Down
4 changes: 2 additions & 2 deletions doc/tutorial/vector_scal_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include <starpu.h>

static __global__ void vector_mult_cuda(float *val, unsigned int n, float factor)
static __global__ void vector_mult_cuda(float *val, size_t n, float factor)
{
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if (i < n)
Expand All @@ -29,7 +29,7 @@ extern "C" void vector_scal_cuda(void *buffers[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &factor);

/* length of the vector */
unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
/* local copy of the vector pointer */
float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned threads_per_block = 64;
Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/vector_scal_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ void vector_scal_opencl(void *buffers[], void *cl_arg)
starpu_codelet_unpack_args(cl_arg, &factor);

/* length of the vector */
unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
cl_ulong n = STARPU_VECTOR_GET_NX(buffers[0]);
/* OpenCL copy of the vector pointer */
cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);

Expand Down
2 changes: 1 addition & 1 deletion doc/tutorial/vector_scal_opencl_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/

__kernel void vector_mult_opencl(unsigned int nx, __global float* val, float factor)
__kernel void vector_mult_opencl(ulong nx, __global float* val, float factor)
{
const int i = get_global_id(0);
if (i < nx)
Expand Down
5 changes: 3 additions & 2 deletions eclipse-plugin/examples/hello/hello.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
void display_cpu_func(void *buffers[], void *cl_arg)
{
(void)cl_arg;
int nx, i;
size_t nx, i;
struct starpu_vector_interface *vector;
int *val;

Expand All @@ -35,7 +35,8 @@ void display_cpu_func(void *buffers[], void *cl_arg)

void scal_cpu_func(void *buffers[], void *cl_arg)
{
int factor, nx, i;
int factor;
size_t nx, i;
struct starpu_vector_interface *vector;
int *val;

Expand Down
4 changes: 2 additions & 2 deletions examples/axpy/axpy.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void axpy_cpu(void *descr[], void *arg)
{
TYPE alpha = *((TYPE *)arg);

unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
size_t n = STARPU_VECTOR_GET_NX(descr[0]);

TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
Expand All @@ -68,7 +68,7 @@ void axpy_gpu(void *descr[], void *arg)
{
TYPE alpha = *((TYPE *)arg);

unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
size_t n = STARPU_VECTOR_GET_NX(descr[0]);

TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
Expand Down
6 changes: 3 additions & 3 deletions examples/axpy/axpy_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ void axpy_opencl(void *buffers[], void *_args)
cl_kernel kernel;
cl_command_queue queue;

unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
cl_ulong n = STARPU_VECTOR_GET_NX(buffers[0]);
cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
unsigned x_offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
cl_ulong x_offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
unsigned y_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);
cl_ulong y_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);

id = starpu_worker_get_id_check();
devid = starpu_worker_get_devid(id);
Expand Down
6 changes: 3 additions & 3 deletions examples/axpy/axpy_opencl_kernel.cl
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
#include "axpy.h"

__kernel void _axpy_opencl(__global TYPE *x,
unsigned x_offset,
ulong x_offset,
__global TYPE *y,
unsigned y_offset,
unsigned nx,
ulong y_offset,
ulong nx,
TYPE alpha)
{
const int i = get_global_id(0);
Expand Down
Loading

0 comments on commit bdb99f3

Please sign in to comment.