Pinned nvidia driver version to fix nightly cuda builds #1485
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name: Test deployment and reimage on OpenStack | |
on: | |
workflow_dispatch: | |
push: | |
branches: | |
- main | |
paths: | |
- '**' | |
- '!dev/**' | |
- 'dev/setup-env.sh' | |
- '!docs/**' | |
- '!README.md' | |
- '!.gitignore' | |
- '!.github/workflows/' | |
- '.github/workflows/stackhpc' | |
pull_request: | |
paths: | |
- '**' | |
- '!dev/**' | |
- 'dev/setup-env.sh' | |
- '!docs/**' | |
- '!README.md' | |
- '!.gitignore' | |
- '!.github/workflows/' | |
- '.github/workflows/stackhpc' | |
jobs: | |
openstack: | |
name: openstack-ci | |
concurrency: | |
group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS | |
cancel-in-progress: true | |
runs-on: ubuntu-22.04 | |
strategy: | |
fail-fast: false # allow other matrix jobs to continue even if one fails | |
matrix: | |
os_version: | |
- RL8 | |
- RL9 | |
env: | |
ANSIBLE_FORCE_COLOR: True | |
OS_CLOUD: openstack | |
TF_VAR_cluster_name: slurmci-${{ matrix.os_version }}-${{ github.run_number }} | |
CI_CLOUD: ${{ vars.CI_CLOUD }} # default from repo settings | |
TF_VAR_os_version: ${{ matrix.os_version }} | |
steps: | |
- uses: actions/checkout@v2 | |
- name: Override CI_CLOUD if PR label is present | |
if: ${{ github.event_name == 'pull_request' }} | |
run: | | |
# Iterate over the labels | |
labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') | |
echo $labels | |
for label in $labels; do | |
if [[ $label == CI_CLOUD=* ]]; then | |
# Extract the value after 'CI_CLOUD=' | |
CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} | |
echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV | |
fi | |
done | |
- name: Record settings for CI cloud | |
run: | | |
echo CI_CLOUD: ${{ env.CI_CLOUD }} | |
- name: Setup ssh | |
run: | | |
set -x | |
mkdir ~/.ssh | |
echo "${{ secrets[format('{0}_SSH_KEY', env.CI_CLOUD)] }}" > ~/.ssh/id_rsa | |
chmod 0600 ~/.ssh/id_rsa | |
shell: bash | |
- name: Add bastion's ssh key to known_hosts | |
run: cat environments/.stackhpc/bastion_fingerprints >> ~/.ssh/known_hosts | |
shell: bash | |
- name: Install ansible etc | |
run: dev/setup-env.sh | |
- name: Install OpenTofu | |
uses: opentofu/setup-opentofu@v1 | |
with: | |
tofu_version: 1.6.2 | |
- name: Initialise terraform | |
run: terraform init | |
working-directory: ${{ github.workspace }}/environments/.stackhpc/terraform | |
- name: Write clouds.yaml | |
run: | | |
mkdir -p ~/.config/openstack/ | |
echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > ~/.config/openstack/clouds.yaml | |
shell: bash | |
- name: Setup environment-specific inventory/terraform inputs | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook ansible/adhoc/generate-passwords.yml | |
echo vault_testuser_password: "$TESTUSER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml | |
env: | |
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} | |
- name: Provision nodes using fat image | |
id: provision_servers | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform | |
terraform apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
- name: Delete infrastructure if provisioning failed | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform | |
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
if: failure() && steps.provision_servers.outcome == 'failure' | |
- name: Configure cluster | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible all -m wait_for_connection | |
ansible-playbook -v ansible/site.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Run MPI-based tests | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/adhoc/hpctests.yml | |
# - name: Run EESSI tests | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# ansible-playbook -vv ansible/ci/check_eessi.yml | |
- name: Confirm Open Ondemand is up (via SOCKS proxy) | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
# load ansible variables into shell: | |
ansible-playbook ansible/ci/output_vars.yml \ | |
-e output_vars_hosts=openondemand \ | |
-e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ | |
-e output_vars_items=bastion_ip,bastion_user,openondemand_servername | |
source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt | |
# setup ssh proxying: | |
sudo apt-get --yes install proxychains | |
echo proxychains installed | |
ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} | |
echo port 9050 forwarded | |
# check OOD server returns 200: | |
statuscode=$(proxychains wget \ | |
--quiet \ | |
--spider \ | |
--server-response \ | |
--no-check-certificate \ | |
--http-user=testuser \ | |
--http-password=${TESTUSER_PASSWORD} https://${openondemand_servername} \ | |
2>&1) | |
(echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) | |
env: | |
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} | |
# - name: Build environment-specific compute image | |
# id: packer_build | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# cd packer/ | |
# packer init | |
# PACKER_LOG=1 packer build -except openstack.fatimage -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl | |
# ../dev/output_manifest.py packer-manifest.json # Sets NEW_COMPUTE_IMAGE_ID outputs | |
# - name: Test reimage of compute nodes to new environment-specific image (via slurm) | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# ansible login -v -a "sudo scontrol reboot ASAP nextstate=RESUME reason='rebuild image:${{ steps.packer_build.outputs.NEW_COMPUTE_IMAGE_ID }}' ${TF_VAR_cluster_name}-compute-[0-3]" | |
# ansible compute -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down | |
# ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Test reimage of login and control nodes (via rebuild adhoc) | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -v --limit control,login ansible/adhoc/rebuild.yml | |
ansible all -m wait_for_connection -a 'delay=60 timeout=600' # delay allows node to go down | |
ansible-playbook -v ansible/site.yml | |
ansible-playbook -v ansible/ci/check_slurm.yml | |
- name: Check sacct state survived reimage | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/ci/check_sacct_hpctests.yml | |
- name: Check MPI-based tests are shown in Grafana | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
ansible-playbook -vv ansible/ci/check_grafana.yml | |
- name: Delete infrastructure | |
run: | | |
. venv/bin/activate | |
. environments/.stackhpc/activate | |
cd $APPLIANCES_ENVIRONMENT_ROOT/terraform | |
terraform destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" | |
if: ${{ success() || cancelled() }} | |
# - name: Delete images | |
# run: | | |
# . venv/bin/activate | |
# . environments/.stackhpc/activate | |
# ansible-playbook -vv ansible/ci/delete_images.yml |