Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Develop #71

Merged
merged 11 commits into from
Nov 14, 2024
29 changes: 0 additions & 29 deletions .github/Dockerfile

This file was deleted.

118 changes: 43 additions & 75 deletions .github/docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,81 +1,49 @@
version: "3.9"

x-fenix: &fenix
build: &fenix-build
context: ./
dockerfile: .github/Dockerfile
args:
OPENMPI_REPO: open-mpi/ompi
OPENMPI_VERS_PREFIX: tags
OPENMPI_VERS: v5.0.0rc10
#Caches should be manually scoped, or they'll conflict.
x-bake:
cache-from:
- type=gha,scope=default
cache-to:
- type=gha,scope=default,mode=max

services:
#fenix_ompi_5rc10:
# <<: *fenix
# image: "fenix:ompi_5rc10"
# build:
# <<: *fenix-build
# x-bake:
# cache-from:
# - type=gha,scope=ompi_5rc10
# cache-to:
# - type=gha,scope=ompi_5rc10,mode=max

fenix_ompi_5:
<<: *fenix
image: "fenix:ompi_5"
bootstrap:
image: "bootstrap"
build:
<<: *fenix-build
dockerfile_inline: |
FROM spack/ubuntu-jammy:0.22.2
VOLUME /configs
ARG OMPI_VERSION
ENV OMPI_VERSION=$${OMPI_VERSION}
CMD cp /configs/spack.yaml . && \
spack -e . add openmpi@$${OMPI_VERSION} && \
spack -e . containerize >/configs/spack.Dockerfile
args:
- OPENMPI_VERS_PREFIX=heads
- OPENMPI_VERS=v5.0.x
x-bake:
cache-from:
- type=gha,scope=ompi_5
cache-to:
- type=gha,scope=ompi_5,mode=max

fenix_ompi_main:
<<: *fenix
image: "fenix:ompi_main"
OMPI_VERSION: main
no_cache: true
pull_policy: build
volumes:
- .github/:/configs

env:
image: "ghcr.io/sandialabs/fenix/env:main"
build:
<<: *fenix-build
args:
- OPENMPI_VERS_PREFIX=heads
- OPENMPI_VERS=main
x-bake:
cache-from:
- type=gha,scope=ompi_main
cache-to:
- type=gha,scope=ompi_main,mode=max

fenix_icldisco_latest:
<<: *fenix
image: "fenix:icldisco_latest"
# Generated by running the bootstrap image
dockerfile: .github/spack.Dockerfile

fenix:
image: "fenix"
build:
<<: *fenix-build
# Inline Dockerfile for the Fenix build-and-test image.
# It starts from the prebuilt environment image selected by OMPI_VERSION,
# copies the source tree to /fenix, configures and builds it with examples and
# tests enabled, and runs ctest by default.
# "$$" is presumably Compose's escape for a literal "$", so the Dockerfile
# itself sees ${OMPI_VERSION} — confirm against the Compose interpolation docs.
# The ARG uses the name=default form so "main" really is the fallback when no
# build arg is supplied.
dockerfile_inline: |
  ARG OMPI_VERSION=main
  FROM ghcr.io/sandialabs/fenix/env:$${OMPI_VERSION}
  COPY . /fenix
  RUN . /opt/spack-environment/activate.sh && \
      mkdir -p /fenix/build && \
      cd /fenix/build && \
      cmake /fenix \
        -DCMAKE_BUILD_TYPE=Release \
        -DCMAKE_C_COMPILER=mpicc \
        -DFENIX_EXAMPLES=ON \
        -DFENIX_TESTS=ON \
        -DMPIEXEC_PREFLAGS="--allow-run-as-root;--map-by;:oversubscribe" && \
      make -j

  WORKDIR /fenix/build
  ENTRYPOINT ["/entrypoint.sh"]
  CMD ["ctest", "--output-on-failure", "--timeout", "60"]
args:
- OPENMPI_REPO=icldisco/ompi
- OPENMPI_VERS_PREFIX=heads
- OPENMPI_VERS=ulfm/latest
x-bake:
cache-from:
- type=gha,scope=icldisco_latest
cache-to:
- type=gha,scope=icldisco_latest,mode=max

#fenix_icldisco_experimental:
# <<: *fenix
# image: fenix/icldisco
# build:
# <<: *fenix-build
# args:
# - OPENMPI_REPO=icldisco/ompi
# - OPENMPI_VERS_PREFIX=heads
# - OPENMPI_VERS=ulfm/experimental
OMPI_VERSION: main
pull_policy: build
31 changes: 31 additions & 0 deletions .github/spack.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Spack environment used to generate the Open MPI environment image.
# The bootstrap container copies this file, adds the requested openmpi spec,
# and runs `spack containerize` on it to produce spack.Dockerfile.
spack:
  packages:
    openmpi:
      variants: +internal-hwloc +internal-libevent +internal-pmix
  concretizer:
    unify: true
    reuse: true

  # Settings read by `spack containerize`.
  container:
    format: docker
    strip: false
    images:
      # Quoted so the colon and the version are always read as plain strings.
      os: "ubuntu:22.04"
      spack: "0.22.2"
    os_packages:
      # Packages listed for the build stage.
      build:
        - build-essential
        - autotools-dev
        - pkg-config
        - python3
        - m4
        - autoconf
        - automake
        - flex
        - git
        - zlib1g-dev
        - libperl-dev
        - numactl
      # Packages listed for the final image.
      final:
        - build-essential
        - cmake
91 changes: 91 additions & 0 deletions .github/workflows/build-env/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Composite action: make sure ghcr.io/sandialabs/fenix/env:<ompi_version>
# exists and is younger than max_age days, rebuilding and pushing it otherwise.
name: Build Environment Image
description: Build the Open MPI environment image for Fenix

# NOTE(review): `type` does not appear among the documented keys for action
# inputs (description / required / default / deprecationMessage); it is
# presumably ignored — confirm before relying on it.
inputs:
  ompi_version:
    description: "Open MPI version to build"
    type: string
    required: true
  token:
    description: "GitHub token for logging into GHCR"
    type: string
    required: true
  max_age:
    description: "Maximum image age before rebuild, in days"
    type: number
    required: false
    # Quoted so the default is a string.
    default: "14"

runs:
  using: "composite"
  steps:
    # Pulls the published image (if any) and records in $GITHUB_ENV whether a
    # sufficiently recent one was found (found=true|false) plus its name (IMG).
    - name: Check for valid image
      shell: bash
      # Inputs reach the script through the environment rather than being
      # spliced into the script text, so their contents are never parsed as
      # shell syntax.
      env:
        OMPI_VERSION: ${{ inputs.ompi_version }}
        MAX_AGE: ${{ inputs.max_age }}
      run: |
        # A failed pull/inspect is an expected outcome here, not an error.
        set +e
        IMG="ghcr.io/sandialabs/fenix/env:${OMPI_VERSION}"
        echo "IMG=$IMG" >> "$GITHUB_ENV"

        # Drop any local copy first so the inspected timestamp comes from the
        # image that was just pulled.
        docker image rm -f "$IMG" 2>/dev/null
        docker pull "$IMG" >/dev/null 2>&1
        IMG_CREATED=$(docker inspect --type=image --format '{{.Created}}' "$IMG" 2>/dev/null)
        if [ -z "$IMG_CREATED" ]; then
          echo "Did not find image $IMG"
          echo "found=false" >> "$GITHUB_ENV"
          exit 0
        fi

        # Age in whole days between now and the image creation time.
        IMG_AGE=$(( ($(date +%s) - $(date -d "$IMG_CREATED" +%s)) / (60*60*24) ))
        echo "Found image $IMG created $IMG_AGE days ago"
        if [ "$IMG_AGE" -lt "$MAX_AGE" ]; then
          echo "Image is valid, skipping build"
          echo "found=true" >> "$GITHUB_ENV"
        else
          echo "Image is too old, rebuilding"
          echo "found=false" >> "$GITHUB_ENV"
        fi

    # Remaining actions only run if we didn't find a valid image.
    - name: Checkout repository
      if: env.found != 'true'
      uses: actions/checkout@v3

    - name: Set up Docker Buildx
      if: env.found != 'true'
      uses: docker/setup-buildx-action@v2

    - name: Log in to GHCR container registry
      if: env.found != 'true'
      uses: docker/login-action@v3
      with:
        registry: ghcr.io
        username: ${{ github.actor }}
        password: ${{ inputs.token }}

    # Builds the "bootstrap" compose target into the local Docker daemon.
    - name: Bake the bootstrap docker image
      if: env.found != 'true'
      uses: docker/bake-action@v5
      with:
        files: .github/docker-compose.yml
        targets: bootstrap
        workdir: .
        set: |
          *.output=type=docker,name=bootstrap
          *.args.OMPI_VERSION=${{ inputs.ompi_version }}

    # Runs the bootstrap image with .github mounted at /configs, which is
    # where it writes the generated spack.Dockerfile.
    - name: Bootstrap the environment Dockerfile
      if: env.found != 'true'
      shell: bash
      run: docker run -v "${GITHUB_WORKSPACE}/.github:/configs" bootstrap

    # Builds the "env" compose target and pushes it to the registry.
    - name: Build the environment
      if: env.found != 'true'
      uses: docker/bake-action@v5
      with:
        files: .github/docker-compose.yml
        targets: env
        workdir: .
        pull: true
        set: |
          env.tags=ghcr.io/sandialabs/fenix/env:${{ inputs.ompi_version }}
          env.output=type=registry,name=ghcr.io/sandialabs/fenix/env:${{ inputs.ompi_version }}
53 changes: 31 additions & 22 deletions .github/workflows/ci_checks.yaml
Original file line number Diff line number Diff line change
@@ -1,31 +1,40 @@
name: Build & Test

on:
push:
pull_request_target:
types:
- opened
- synchronized
- edited
pull_request:

jobs:
test:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
ompi_version:
- main
- 5.0.3

steps:
- uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- name: Build
uses: docker/bake-action@master
- name: Checkout
uses: actions/checkout@v3

- name: Build the environment image
uses: ./.github/workflows/build-env
with:
ompi_version: ${{ matrix.ompi_version }}
token: ${{ secrets.GITHUB_TOKEN }}
max_age: 14 #days

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2

- name: Build Fenix
uses: docker/bake-action@v5
with:
files: |
.github/docker-compose.yml
load: true
- name: Test open-mpi v5.0.x
if: success() || failure()
run: docker run fenix:ompi_5
- name: Test open-mpi main
if: success() || failure()
run: docker run fenix:ompi_main
- name: Test icldisco latest
if: success() || failure()
run: docker run fenix:icldisco_latest
files: .github/docker-compose.yml
targets: fenix
set: |
*.output=type=docker,name=fenix
*.args.OMPI_VERSION=${{ matrix.ompi_version }}

- name: Test Fenix
run: docker run fenix
9 changes: 7 additions & 2 deletions include/fenix.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ extern "C" {
#define FENIX_SUCCESS 0
#define FENIX_ERROR_UNINITIALIZED -9
#define FENIX_ERROR_NOCATEGORY -10
#define FENIX_ERROR_CALLBACK_NOT_REGISTERD -11
#define FENIX_ERROR_CALLBACK_NOT_REGISTERED -11
#define FENIX_ERROR_GROUP_CREATE -12
#define FENIX_ERROR_MEMBER_CREATE -13
#define FENIX_ERROR_COMMIT_BARRIER -133
Expand Down Expand Up @@ -105,7 +105,8 @@ extern "C" {
#define FENIX_DATA_SUBSET_CREATED 2

#define FENIX_ERRHANDLER_LOC 1
#define FENIX_DATA_COMMIT_BARRIER_LOC 2
#define FENIX_FINALIZE_LOC 2
#define FENIX_DATA_COMMIT_BARRIER_LOC 4


#define FENIX_DATA_POLICY_IN_MEMORY_RAID 13
Expand Down Expand Up @@ -142,6 +143,8 @@ int Fenix_Initialized(int *);
int Fenix_Callback_register(void (*recover)(MPI_Comm, int, void *),
void *callback_data);

int Fenix_Callback_pop();

int Fenix_get_number_of_ranks_with_role(int, int *);

int Fenix_get_role(MPI_Comm comm, int rank, int *role);
Expand Down Expand Up @@ -228,6 +231,8 @@ int Fenix_Process_fail_list(int** fail_list);

int Fenix_check_cancelled(MPI_Request *request, MPI_Status *status);

int Fenix_Process_detect_failures(int do_recovery);

#if defined(c_plusplus) || defined(__cplusplus)
}
#endif
Expand Down
3 changes: 3 additions & 0 deletions include/fenix_ext.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ typedef struct {
//Manage state of the comms. Necessary when failures happen rapidly, mussing up state
int new_world_exists, user_world_exists;

int dummy_recv_buffer;
MPI_Request check_failures_req;


MPI_Op agree_op; // This is reserved for the global agreement call for Fenix data recovery API

Expand Down
Loading
Loading