Allow large sizes for vector, matrix, block, tensor and ndim data interfaces, and use proper MPI datatypes to exchange them.
starpu-runtime · Oct 15, 2024 · bdb99f3 · bdb99f3
1 parent e5a489c
commit bdb99f3
Show file tree

Hide file tree

Showing 256 changed files with 2,700 additions and 1,952 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -65,6 +65,17 @@ check:
   tags:
     - ${NODE}
 
+check_mpich:
+  extends: .check_template
+  tags:
+    - starpu
+    - ubuntu1804
+  parallel:
+    matrix:
+      - SCRIPT: [./contrib/gitlab/mpich.sh, ./contrib/gitlab/mpich_struct.sh]
+  script:
+    - ${SCRIPT}
+
 check_simgrid:
   extends: .check_template
   tags:

diff --git a/ChangeLog b/ChangeLog
@@ -19,6 +19,8 @@ StarPU 1.5.0
 Changes:
    * Rename hierarchical tasks in recursive tasks
    * Fix asynchronous partitioning with data without home node
+   * Allow large sizes for vector, matrix, block, tensor and ndim data
+     interfaces, and use proper MPI datatypes to exchange them.
 
 Small changes:
   * Fix build system for StarPU Python interface

diff --git a/configure.ac b/configure.ac
@@ -610,6 +610,9 @@ if test x$enable_mpi = xmaybe ; then
     fi
 fi
 
+AC_ARG_ENABLE(mpi-type-vector-c, AC_HELP_STRING([--disable-mpi-type-vector-c], [Disable usage of function MPI_Type_vector_c]),
+	      [enable_mpi_type_vector_c=$enableval], [enable_mpi_type_vector_c=yes])
+
 # in case MPI was explicitly required, but mpicc is not available, this is an error
 if test x$enable_mpi = xyes ; then
 	if test ! -x "$mpicc_path"; then
@@ -626,6 +629,9 @@ if test x$enable_mpi = xyes ; then
 		[AC_DEFINE(STARPU_HAVE_MPI_EXT, [1], [<mpi-ext.h> is available])])
 
 	AC_CHECK_FUNC([MPI_Comm_create_group], [AC_DEFINE([STARPU_HAVE_MPI_COMM_CREATE_GROUP], [1], [Define to 1 if the function MPI_Comm_create_group is available.])])
+	if test x$enable_mpi_type_vector_c = xyes ; then
+		AC_CHECK_FUNC([MPI_Type_vector_c], [AC_DEFINE([STARPU_HAVE_MPI_TYPE_VECTOR_C], [1], [Define to 1 if the function MPI_Type_vector_c is available.])])
+	fi
 	CC=$OLD_CC
 fi
 

diff --git a/contrib/gitlab/mpich.sh b/contrib/gitlab/mpich.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020-2024   University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export STARPU_MICROBENCHS_DISABLED=1
+export STARPU_CHECK_DIRS=mpi
+export STARPU_USER_CONFIGURE_OPTIONS="--with-mpicc=/usr/bin/mpicc.mpich --with-mpiexec=/usr/bin/mpiexec.mpich --with-mpicxx=/usr/bin/mpicxx.mpich --with-mpifort=/usr/bin/mpifort.mpich"
+./contrib/ci.inria.fr/job-1-check.sh
diff --git a/contrib/gitlab/mpich_struct.sh b/contrib/gitlab/mpich_struct.sh
@@ -0,0 +1,21 @@
+#!/bin/sh
+# StarPU --- Runtime system for heterogeneous multicore architectures.
+#
+# Copyright (C) 2020-2024   University of Bordeaux, CNRS (LaBRI UMR 5800), Inria
+#
+# StarPU is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation; either version 2.1 of the License, or (at
+# your option) any later version.
+#
+# StarPU is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the GNU Lesser General Public License in COPYING.LGPL for more details.
+#
+
+export STARPU_MICROBENCHS_DISABLED=1
+export STARPU_CHECK_DIRS=mpi
+export STARPU_USER_CONFIGURE_OPTIONS="--with-mpicc=/usr/bin/mpicc.mpich --with-mpiexec=/usr/bin/mpiexec.mpich --with-mpicxx=/usr/bin/mpicxx.mpich --with-mpifort=/usr/bin/mpifort.mpich --disable-mpi-type-vector-c"
+./contrib/ci.inria.fr/job-1-check.sh
diff --git a/doc/doxygen/chapters/starpu_applications/code/vector_scal_cpu.c b/doc/doxygen/chapters/starpu_applications/code/vector_scal_cpu.c
@@ -24,7 +24,7 @@ void vector_scal_cpu(void *buffers[], void *cl_arg)
 //! [Extract To be included. You should update doxygen if you see this text.]
 	struct starpu_vector_interface *vector = buffers[0];
 	float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
-	unsigned n = STARPU_VECTOR_GET_NX(vector);
+	size_t n = STARPU_VECTOR_GET_NX(vector);
 //! [Extract To be included. You should update doxygen if you see this text.]
 
 //! [Unpack To be included. You should update doxygen if you see this text.]
@@ -33,7 +33,7 @@ void vector_scal_cpu(void *buffers[], void *cl_arg)
 //! [Unpack To be included. You should update doxygen if you see this text.]
 
 //! [Compute To be included. You should update doxygen if you see this text.]
-	unsigned i;
+	size_t i;
 	for (i = 0; i < n; i++)
 		val[i] *= factor;
 //! [Compute To be included. You should update doxygen if you see this text.]

diff --git a/doc/doxygen/chapters/starpu_basics/basic_examples.doxy b/doc/doxygen/chapters/starpu_basics/basic_examples.doxy
@@ -349,11 +349,11 @@ The definition of the codelet can be written as follows:
 \code{.c}
 void scal_cpu_func(void *buffers[], void *cl_arg)
 {
-    unsigned i;
+    size_t i;
     float *factor = cl_arg;
 
     /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* CPU copy of the vector pointer */
     float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
 

diff --git a/doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_cpu.c b/doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_cpu.c
@@ -22,7 +22,7 @@
 /* This kernel takes a buffer and scales it by a constant factor */
 void scal_cpu_func(void *buffers[], void *cl_arg)
 {
-    unsigned i;
+    size_t i;
     float *factor = cl_arg;
 
     /*
@@ -38,7 +38,7 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
     struct starpu_vector_interface *vector = buffers[0];
 
     /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    size_t n = STARPU_VECTOR_GET_NX(vector);
 
     /* get a pointer to the local copy of the vector: note that we have to
      * cast it in (float *) since a vector could contain any type of
@@ -53,22 +53,22 @@ void scal_cpu_func(void *buffers[], void *cl_arg)
 void scal_sse_func(void *buffers[], void *cl_arg)
 {
     float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
-    unsigned int n_iterations = n/4;
+    size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
+    size_t n_iterations = n/4;
 
     __m128 *VECTOR = (__m128*) vector;
     __m128 FACTOR STARPU_ATTRIBUTE_ALIGNED(16);
     float factor = *(float *) cl_arg;
     FACTOR = _mm_set1_ps(factor);
 
-    unsigned int i;
+    size_t i;
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(FACTOR, VECTOR[i]);
 
     unsigned int remainder = n%4;
     if (remainder != 0)
     {
-        unsigned int start = 4 * n_iterations;
+        size_t start = 4 * n_iterations;
         for (i = start; i < start+remainder; ++i)
         {
             vector[i] = factor * vector[i];

diff --git a/doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_cuda.c b/doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_cuda.c
@@ -16,7 +16,7 @@
 //! [To be included. You should update doxygen if you see this text.]
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(unsigned n, float *val, float factor)
+static __global__ void vector_mult_cuda(size_t n, float *val, float factor)
 {
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
         if (i < n)
@@ -28,7 +28,7 @@ extern "C" void scal_cuda_func(void *buffers[], void *_args)
         float *factor = (float *)_args;
 
         /* length of the vector */
-        unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+        size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
         /* local copy of the vector pointer */
         float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
         unsigned threads_per_block = 64;

diff --git a/doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_opencl.c b/doc/doxygen/chapters/starpu_basics/code/basics_vector_scal_opencl.c
@@ -28,7 +28,7 @@ void scal_opencl_func(void *buffers[], void *_args)
     cl_event event;                       /* OpenCL specific code */
 
     /* length of the vector */
-    unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+    size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
     /* OpenCL copy of the vector pointer */
     cl_mem val = (cl_mem)STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 

diff --git a/doc/doxygen/chapters/starpu_extensions/advanced_tasks.doxy b/doc/doxygen/chapters/starpu_extensions/advanced_tasks.doxy
@@ -140,16 +140,16 @@ to use SSE to scale a vector. The codelet can be written as follows:
 void scal_sse_func(void *buffers[], void *cl_arg)
 {
     float *vector = (float *) STARPU_VECTOR_GET_PTR(buffers[0]);
-    unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
-    unsigned int n_iterations = n/4;
+    size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
+    size_t n_iterations = n/4;
     if (n % 4 != 0)
         n_iterations++;
 
     __m128 *VECTOR = (__m128*) vector;
     __m128 factor __attribute__((aligned(16)));
     factor = _mm_set1_ps(*(float *) cl_arg);
 
-    unsigned int i;
+    size_t i;
     for (i = 0; i < n_iterations; i++)
         VECTOR[i] = _mm_mul_ps(factor, VECTOR[i]);
 }
@@ -352,10 +352,10 @@ the rank of the current CPU within the combined worker. For instance:
 \code{.c}
 static void func(void *buffers[], void *args)
 {
-    unsigned i;
+    size_t i;
     float *factor = _args;
     struct starpu_vector_interface *vector = buffers[0];
-    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    size_t n = STARPU_VECTOR_GET_NX(vector);
     float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
 
     /* Compute slice to compute */

diff --git a/doc/doxygen/chapters/starpu_extensions/code/forkmode.c b/doc/doxygen/chapters/starpu_extensions/code/forkmode.c
@@ -17,10 +17,10 @@
 //! [To be included. You should update doxygen if you see this text.]
 void scal_cpu_func(void *buffers[], void *_args)
 {
-    unsigned i;
+    size_t i;
     float *factor = _args;
     struct starpu_vector_interface *vector = buffers[0];
-    unsigned n = STARPU_VECTOR_GET_NX(vector);
+    size_t n = STARPU_VECTOR_GET_NX(vector);
     float *val = (float *)STARPU_VECTOR_GET_PTR(vector);
 
 #pragma omp parallel for num_threads(starpu_combined_worker_get_size())

diff --git a/doc/doxygen/chapters/starpu_extensions/max_fpga_support.doxy b/doc/doxygen/chapters/starpu_extensions/max_fpga_support.doxy
@@ -183,7 +183,7 @@ void fpga_add(void *buffers[], void *cl_arg)
     int *b = (int*) STARPU_VECTOR_GET_PTR(buffers[1]);
     int *c = (int*) STARPU_VECTOR_GET_PTR(buffers[2]);
 
-    int size = STARPU_VECTOR_GET_NX(buffers[0]);
+    size_t size = STARPU_VECTOR_GET_NX(buffers[0]);
 
     /* actions to run on an engine */
     max_actions_t *act = max_actions_init(maxfile, NULL);

diff --git a/doc/doxygen/chapters/starpu_extensions/recursive_tasks.doxy b/doc/doxygen/chapters/starpu_extensions/recursive_tasks.doxy
@@ -56,8 +56,8 @@ perform the computation.
 void func_cpu(void *descr[], void *_args)
 {
 	(void) _args;
-	int x;
-	int nx = STARPU_VECTOR_GET_NX(descr[0]);
+	size_t x;
+	size_t nx = STARPU_VECTOR_GET_NX(descr[0]);
 	TYPE *v = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 
 	for(x=0 ; x<nx ; x++)

diff --git a/doc/tutorial/vector_scal_cpu.c b/doc/tutorial/vector_scal_cpu.c
@@ -19,7 +19,7 @@
 /* This kernel takes a buffer and scales it by a constant factor */
 void vector_scal_cpu(void *buffers[], void *cl_arg)
 {
-	unsigned i;
+	size_t i;
 	float factor;
 
 	/*
@@ -35,7 +35,7 @@ void vector_scal_cpu(void *buffers[], void *cl_arg)
 	struct starpu_vector_interface *vector = buffers[0];
 
 	/* length of the vector */
-	unsigned n = STARPU_VECTOR_GET_NX(vector);
+	size_t n = STARPU_VECTOR_GET_NX(vector);
 
 	/* get a pointer to the local copy of the vector : note that we have to
 	 * cast it in (float *) since a vector could contain any type of

diff --git a/doc/tutorial/vector_scal_cuda.cu b/doc/tutorial/vector_scal_cuda.cu
@@ -16,7 +16,7 @@
 
 #include <starpu.h>
 
-static __global__ void vector_mult_cuda(float *val, unsigned int n, float factor)
+static __global__ void vector_mult_cuda(float *val, size_t n, float factor)
 {
         unsigned i =  blockIdx.x*blockDim.x + threadIdx.x;
         if (i < n)
@@ -29,7 +29,7 @@ extern "C" void vector_scal_cuda(void *buffers[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &factor);
 
         /* length of the vector */
-        unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+        size_t n = STARPU_VECTOR_GET_NX(buffers[0]);
         /* local copy of the vector pointer */
         float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
         unsigned threads_per_block = 64;

diff --git a/doc/tutorial/vector_scal_opencl.c b/doc/tutorial/vector_scal_opencl.c
@@ -29,7 +29,7 @@ void vector_scal_opencl(void *buffers[], void *cl_arg)
 	starpu_codelet_unpack_args(cl_arg, &factor);
 
 	/* length of the vector */
-	unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
+	cl_ulong n = STARPU_VECTOR_GET_NX(buffers[0]);
 	/* OpenCL copy of the vector pointer */
 	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
 

diff --git a/doc/tutorial/vector_scal_opencl_kernel.cl b/doc/tutorial/vector_scal_opencl_kernel.cl
@@ -14,7 +14,7 @@
  * See the GNU Lesser General Public License in COPYING.LGPL for more details.
  */
 
-__kernel void vector_mult_opencl(unsigned int nx, __global float* val, float factor)
+__kernel void vector_mult_opencl(ulong nx, __global float* val, float factor)
 {
         const int i = get_global_id(0);
         if (i < nx)

diff --git a/eclipse-plugin/examples/hello/hello.c b/eclipse-plugin/examples/hello/hello.c
@@ -21,7 +21,7 @@
 void display_cpu_func(void *buffers[], void *cl_arg)
 {
 	(void)cl_arg;
-	int nx, i;
+	size_t nx, i;
 	struct starpu_vector_interface *vector;
 	int *val;
 
@@ -35,7 +35,8 @@ void display_cpu_func(void *buffers[], void *cl_arg)
 
 void scal_cpu_func(void *buffers[], void *cl_arg)
 {
-	int factor, nx, i;
+	int factor;
+	size_t nx, i;
 	struct starpu_vector_interface *vector;
 	int *val;
 

diff --git a/examples/axpy/axpy.c b/examples/axpy/axpy.c
@@ -55,7 +55,7 @@ void axpy_cpu(void *descr[], void *arg)
 {
 	TYPE alpha = *((TYPE *)arg);
 
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	size_t n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);
@@ -68,7 +68,7 @@ void axpy_gpu(void *descr[], void *arg)
 {
 	TYPE alpha = *((TYPE *)arg);
 
-	unsigned n = STARPU_VECTOR_GET_NX(descr[0]);
+	size_t n = STARPU_VECTOR_GET_NX(descr[0]);
 
 	TYPE *block_x = (TYPE *)STARPU_VECTOR_GET_PTR(descr[0]);
 	TYPE *block_y = (TYPE *)STARPU_VECTOR_GET_PTR(descr[1]);

diff --git a/examples/axpy/axpy_opencl.c b/examples/axpy/axpy_opencl.c
@@ -29,11 +29,11 @@ void axpy_opencl(void *buffers[], void *_args)
 	cl_kernel kernel;
 	cl_command_queue queue;
 
-	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
+	cl_ulong n = STARPU_VECTOR_GET_NX(buffers[0]);
 	cl_mem x = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);
-	unsigned x_offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
+	cl_ulong x_offset = STARPU_VECTOR_GET_OFFSET(buffers[0]);
 	cl_mem y = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[1]);
-	unsigned y_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);
+	cl_ulong y_offset = STARPU_VECTOR_GET_OFFSET(buffers[1]);
 
 	id = starpu_worker_get_id_check();
 	devid = starpu_worker_get_devid(id);

diff --git a/examples/axpy/axpy_opencl_kernel.cl b/examples/axpy/axpy_opencl_kernel.cl
@@ -19,10 +19,10 @@
 #include "axpy.h"
 
 __kernel void _axpy_opencl(__global TYPE *x,
-			   unsigned x_offset,
+			   ulong x_offset,
 			   __global TYPE *y,
-			   unsigned y_offset,
-			   unsigned nx,
+			   ulong y_offset,
+			   ulong nx,
 			   TYPE alpha)
 {
         const int i = get_global_id(0);