diff --git a/Dockerfile b/Dockerfile
index ff2ae2704..6016d1aee 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,11 +1,13 @@
 # syntax=docker/dockerfile:1
 
-ARG FULL_STACK_VER=2024.1022
+ARG FULL_STACK_VER=2024.1023
 ARG UV_VER=0.4.7
 ARG QE_VER=7.2
 ARG QE_DIR=/opt/conda/envs/quantum-espresso-${QE_VER}
+ARG HQ_VER=0.19.0
 ARG UV_CACHE_DIR=/tmp/uv_cache
 ARG QE_APP_SRC=/tmp/quantum-espresso
+ARG HQ_COMPUTER="localhost-hq"
 
 FROM ghcr.io/astral-sh/uv:${UV_VER} AS uv
 
@@ -43,22 +45,44 @@ RUN --mount=from=uv,source=/uv,target=/bin/uv \
 
 # STAGE 3
 # - Prepare AiiDA profile and localhost computer
+# - Prepare the hq computer, using hyperqueue as the scheduler
 # - Install QE codes and pseudopotentials
 # - Archive home folder
 FROM build_deps AS home_build
 ARG QE_DIR
+ARG HQ_VER
+ARG HQ_COMPUTER
+
+# Install the hq binary
+RUN wget -c -O hq.tar.gz https://github.com/It4innovations/hyperqueue/releases/download/v${HQ_VER}/hq-v${HQ_VER}-linux-x64.tar.gz && \
+    tar xf hq.tar.gz -C /opt/conda/
+
 ENV PSEUDO_FOLDER=/tmp/pseudo
 RUN mkdir -p ${PSEUDO_FOLDER} && \
     python -m aiidalab_qe download-pseudos --dest ${PSEUDO_FOLDER}
 
+ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
+# Install the aiida-hyperqueue plugin
+# XXX: fix me after aiida-hyperqueue is released
+RUN --mount=from=uv,source=/uv,target=/bin/uv \
+    --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
+    uv pip install --system --strict --cache-dir=${UV_CACHE_DIR} \
+    "aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"
+
+COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/
+
+ENV HQ_COMPUTER=$HQ_COMPUTER
+
 # TODO: Remove PGSQL and daemon log files, and other unneeded files
 RUN --mount=from=qe_conda_env,source=${QE_DIR},target=${QE_DIR} \
     bash /usr/local/bin/before-notebook.d/20_start-postgresql.sh && \
     bash /usr/local/bin/before-notebook.d/40_prepare-aiida.sh && \
-    python -m aiidalab_qe install-qe && \
+    bash /usr/local/bin/before-notebook.d/42_setup-hq-computer.sh && \
+    python -m aiidalab_qe install-qe --computer ${HQ_COMPUTER} && \
     python -m aiidalab_qe install-pseudos --source ${PSEUDO_FOLDER} && \
     verdi daemon stop && \
     mamba run -n aiida-core-services pg_ctl stop && \
+    touch /home/${NB_USER}/.FLAG_HOME_INITIALIZED && \
     cd /home/${NB_USER} && tar -cf /opt/conda/home.tar .
 
 # STAGE 3 - Final stage
@@ -71,22 +95,31 @@ FROM ghcr.io/aiidalab/full-stack:${FULL_STACK_VER}
 ARG QE_DIR
 ARG QE_APP_SRC
 ARG UV_CACHE_DIR
+ARG HQ_COMPUTER
 USER ${NB_USER}
 
 WORKDIR /tmp
 
 # Install python dependencies
 # Use uv cache from the previous build step
+# Install the aiida-hyperqueue plugin
+# XXX: fix me after aiida-hyperqueue is released
 ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
 RUN --mount=from=uv,source=/uv,target=/bin/uv \
     --mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
     --mount=from=build_deps,source=${QE_APP_SRC},target=${QE_APP_SRC},rw \
-    uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC}
+    uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC} "aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"
+
+# Copy the hq binary
+COPY --from=home_build /opt/conda/hq /usr/local/bin/
 
 COPY --from=qe_conda_env ${QE_DIR} ${QE_DIR}
 
 USER root
+
 COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/
+ENV HQ_COMPUTER=$HQ_COMPUTER
+
 # Remove content of $HOME
 # '-mindepth=1' ensures that we do not remove the home directory itself.
 RUN find /home/${NB_USER}/ -mindepth 1 -delete
diff --git a/before-notebook.d/00_untar_home.sh b/before-notebook.d/00_untar-home.sh
similarity index 61%
rename from before-notebook.d/00_untar_home.sh
rename to before-notebook.d/00_untar-home.sh
index d911474e8..d55902280 100644
--- a/before-notebook.d/00_untar_home.sh
+++ b/before-notebook.d/00_untar-home.sh
@@ -4,7 +4,7 @@ set -eux
 home="/home/${NB_USER}"
 
 # Untar home archive file to restore home directory if it is empty
-if [[ $(ls -A ${home} | wc -l) = "0" ]]; then
+if [ ! -e $home/.FLAG_HOME_INITIALIZED ]; then
   if [[ ! -f $HOME_TAR ]]; then
     echo "File $HOME_TAR does not exist!"
     exit 1
@@ -15,12 +15,20 @@ if [[ $(ls -A ${home} | wc -l) = "0" ]]; then
   fi
 
   echo "Extracting $HOME_TAR to $home"
+  # NOTE: tar reports the following errors when deployed on k8s, but at the moment they do not cause any issues:
+  # tar: .: Cannot utime: Operation not permitted
+  # tar: .: Cannot change mode to rwxr-s---: Operation not permitted
   tar -xf $HOME_TAR -C "$home"
-
-  echo "Copying directory '$QE_APP_FOLDER' to '$AIIDALAB_APPS'"
-  cp -r "$QE_APP_FOLDER" "$AIIDALAB_APPS"
 else
   echo "$home folder is not empty!"
   ls -lrta "$home"
 fi
+
+if [ -d $AIIDALAB_APPS/quantum-espresso ]; then
+  echo "Quantum ESPRESSO app already exists"
+else
+  echo "Copying directory '$QE_APP_FOLDER' to '$AIIDALAB_APPS'"
+  cp -r "$QE_APP_FOLDER" "$AIIDALAB_APPS"
+fi
+
 set +eux
diff --git a/before-notebook.d/42_setup-hq-computer.sh b/before-notebook.d/42_setup-hq-computer.sh
new file mode 100755
index 000000000..7031ced00
--- /dev/null
+++ b/before-notebook.d/42_setup-hq-computer.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+set -x
+
+# Set up the HyperQueue computer
+verdi computer show ${HQ_COMPUTER} || verdi computer setup \
+    --non-interactive \
+    --label "${HQ_COMPUTER}" \
+    --description "local computer with hyperqueue scheduler" \
+    --hostname "localhost" \
+    --transport core.local \
+    --scheduler hyperqueue \
+    --work-dir /home/${NB_USER}/aiida_run/ \
+    --mpirun-command "mpirun -np {num_cpus}"
+
+verdi computer configure core.local "${HQ_COMPUTER}" \
+    --non-interactive \
+    --safe-interval 5.0
+
+# Disable the localhost computer that is set up in the base image
+verdi computer disable localhost aiida@localhost
diff --git a/before-notebook.d/43_start-hq.sh b/before-notebook.d/43_start-hq.sh
new file mode 100644
index 000000000..c20a462e4
--- /dev/null
+++ b/before-notebook.d/43_start-hq.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+set -x
+
+# NOTE: this cgroup folder hierarchy assumes cgroup v2;
+# if the container runs on a system that only has cgroup v1, the following will fail.
+# Since the image is mostly meant for demo servers where we control the machine and OS,
+# we assume cgroup v2 is available (Kubernetes >= v1.25).
+# We only build the image for the demo server, so users are not required to have the new cgroup version,
+# but developers should make sure their system uses cgroup v2.
+# See: https://kubernetes.io/docs/concepts/architecture/cgroups/#using-cgroupv2
+
+# Read the container memory limit from the cgroup at runtime
+MEMORY_LIMIT=$(cat /sys/fs/cgroup/memory.max)
+
+if [ "$MEMORY_LIMIT" = "max" ]; then
+    MEMORY_LIMIT=4096
+    echo "No memory limit set, using 4 GiB"
+else
+    MEMORY_LIMIT=$(echo "scale=0; $MEMORY_LIMIT / (1024 * 1024)" | bc)
+    echo "Memory Limit: ${MEMORY_LIMIT} MiB"
+fi
+
+# Compute the number of CPUs allocated to the container
+CPU_LIMIT=$(awk '{print $1}' /sys/fs/cgroup/cpu.max)
+CPU_PERIOD=$(awk '{print $2}' /sys/fs/cgroup/cpu.max)
+
+if [ "$CPU_PERIOD" -ne 0 ]; then
+    CPU_NUMBER=$(echo "scale=2; $CPU_LIMIT / $CPU_PERIOD" | bc)
+    echo "Number of CPUs allocated: $CPU_NUMBER"
+
+    # For the HQ setting, round down to an integer number of CPUs; the remainder is left for system tasks
+    CPU_LIMIT=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
+else
+    # If no limit is set (e.g. a local OCI runtime without a CPU limit), use all CPUs
+    CPU_LIMIT=$(nproc)
+    echo "No CPU limit set"
+fi
+
+# Start the hq server with a worker
+run-one-constantly hq server start 1>$HOME/.hq-stdout 2>$HOME/.hq-stderr &
+run-one-constantly hq worker start --cpus=${CPU_LIMIT} --resource "mem=sum(${MEMORY_LIMIT})" --no-detect-resources &
+
+# Reset the default memory_per_machine and default_mpiprocs_per_machine
+# c.set_default_mpiprocs_per_machine = ${CPU_LIMIT}
+# c.set_default_memory_per_machine = ${MEMORY_LIMIT}
+
+# As for the original localhost computer, set the job poll interval to 2.0 seconds.
+# In addition, set the default mpiprocs and memory per machine.
+# TODO: this runs every time the container starts; we need a lock file to prevent that.
+job_poll_interval="2.0"
+computer_name=${HQ_COMPUTER}
+python -c "
+from aiida import load_profile; from aiida.orm import load_computer;
+load_profile();
+load_computer('${computer_name}').set_minimum_job_poll_interval(${job_poll_interval})
+load_computer('${computer_name}').set_default_mpiprocs_per_machine(${CPU_LIMIT})
+load_computer('${computer_name}').set_default_memory_per_machine(${MEMORY_LIMIT})
+"
diff --git a/src/aiidalab_qe/__main__.py b/src/aiidalab_qe/__main__.py
index 1e5d5043a..2a81f56d0 100644
--- a/src/aiidalab_qe/__main__.py
+++ b/src/aiidalab_qe/__main__.py
@@ -16,19 +16,20 @@ def cli():
 @cli.command()
 @click.option("-f", "--force", is_flag=True)
+@click.option("--computer")
 @click.option("-p", "--profile", default=_DEFAULT_PROFILE)
-def install_qe(force, profile):
+def install_qe(force, profile, computer):
     from aiida import load_profile
 
-    from aiidalab_qe.setup.codes import codes_are_setup, install
+    from aiidalab_qe.setup.codes import codes_are_setup, install_and_setup
 
     load_profile(profile)
     try:
-        for msg in install(force=force):
+        for msg in install_and_setup(computer=computer, force=force):
             click.echo(msg)
-        assert codes_are_setup()
+        assert codes_are_setup(computer=computer)
         click.secho("Codes are setup!", fg="green")
     except Exception as error:
-        raise click.ClickException(f"Failed to set up QE failed: {error}") from error
+        raise click.ClickException(f"Failed to set up QE: {error}") from error
 
 
 @cli.command()
diff --git a/src/aiidalab_qe/common/setup_codes.py b/src/aiidalab_qe/common/setup_codes.py
index 32cae7ede..99aa4e5ec 100644
--- a/src/aiidalab_qe/common/setup_codes.py
+++ b/src/aiidalab_qe/common/setup_codes.py
@@ -4,7 +4,7 @@
 import ipywidgets as ipw
 import traitlets
 
-from ..setup.codes import QE_VERSION, install
+from ..setup.codes import QE_VERSION, install_and_setup
 from .widgets import ProgressBar
 
 __all__ = [
@@ -66,7 +66,7 @@ def _refresh_installed(self):
         try:
             self.set_trait("busy", True)
 
-            for msg in install():
+            for msg in install_and_setup():
                 self.set_message(msg)
 
         except Exception as error:
diff --git a/src/aiidalab_qe/plugins/utils.py b/src/aiidalab_qe/plugins/utils.py
index 8aa602ac5..a1d6ee841 100644
--- a/src/aiidalab_qe/plugins/utils.py
+++ b/src/aiidalab_qe/plugins/utils.py
@@ -3,12 +3,22 @@
 
 def set_component_resources(component, code_info):
     """Set the resources for a given component based on the code info."""
-    if code_info:  # Ensure code_info is not None or empty
-        component.metadata.options.resources = {
-            "num_machines": code_info["nodes"],
-            "num_mpiprocs_per_machine": code_info["ntasks_per_node"],
-            "num_cores_per_mpiproc": code_info["cpus_per_task"],
-        }
+    if code_info:  # Ensure code_info is not None or empty (# XXX: from jyu: should we emit a warning to the plugin developer here?)
+        code: orm.Code = code_info["code"]
+        if code.computer.scheduler_type == "hyperqueue":
+            component.metadata.options.resources = {
+                "num_cpus": code_info["nodes"]
+                * code_info["ntasks_per_node"]
+                * code_info["cpus_per_task"]
+            }
+        else:
+            # XXX: jyu: properly handle the scheduler_type here, which can be "core.direct" (will be replaced by hyperqueue) or "core.slurm" ...
+            component.metadata.options.resources = {
+                "num_machines": code_info["nodes"],
+                "num_mpiprocs_per_machine": code_info["ntasks_per_node"],
+                "num_cores_per_mpiproc": code_info["cpus_per_task"],
+            }
+
         component.metadata.options["max_wallclock_seconds"] = code_info[
             "max_wallclock_seconds"
         ]
diff --git a/src/aiidalab_qe/setup/codes.py b/src/aiidalab_qe/setup/codes.py
index 21eba487e..c63a76bb7 100644
--- a/src/aiidalab_qe/setup/codes.py
+++ b/src/aiidalab_qe/setup/codes.py
@@ -1,13 +1,14 @@
+import subprocess
 from pathlib import Path
 from shutil import which
-from subprocess import CalledProcessError, run
 
 from filelock import FileLock, Timeout
 
 from aiida.common.exceptions import NotExistent
 from aiida.orm import load_code
 
-FN_LOCKFILE = Path.home().joinpath(".install-qe-on-localhost.lock")
+FN_INSTALL_LOCKFILE = Path.home().joinpath(".install-qe-on-localhost.lock")
+FN_SETUP_LOCKFILE = Path.home().joinpath(".setup-qe-on-localhost.lock")
 FN_DO_NOT_SETUP = Path.cwd().joinpath(".do-not-setup-on-localhost")
 
 QE_VERSION = "7.2"
@@ -43,11 +44,22 @@ def get_qe_env():
 
 
 def qe_installed():
-    return get_qe_env().exists()
+    import json
+
+    env_exist = get_qe_env().exists()
+    proc = subprocess.run(
+        ["conda", "list", "-n", f"{get_qe_env().name}", "--json", "--full-name", "qe"],
+        check=True,
+        capture_output=True,
+    )
+
+    info = json.loads(str(proc.stdout.decode()))[0]
+
+    return env_exist and "qe" == info["name"]
 
 
 def install_qe():
-    run(
+    subprocess.run(
         [
             "conda",
             "create",
@@ -64,17 +76,17 @@ def install_qe():
     )
 
 
-def _code_is_setup(name):
+def _code_is_setup(name, computer):
     try:
-        load_code(f"{name}-{QE_VERSION}@localhost")
+        load_code(f"{name}-{QE_VERSION}@{computer}")
     except NotExistent:
         return False
     else:
         return True
 
 
-def codes_are_setup():
-    return all(_code_is_setup(code_name) for code_name in CODE_NAMES)
+def codes_are_setup(computer):
+    return all(_code_is_setup(code_name, computer) for code_name in CODE_NAMES)
 
 
 def _generate_header_to_setup_code():
@@ -89,13 +101,13 @@ def _generate_header_to_setup_code():
     return header_code
 
 
-def _generate_string_to_setup_code(code_name, computer_name="localhost"):
+def _generate_string_to_setup_code(code_name, computer):
     """Generate the Python string to setup an AiiDA code for a given computer.
 
     Tries to load an existing code and if not existent, generates Python code
     to create and store a new code setup."""
     try:
-        load_code(f"{code_name}-{QE_VERSION}@{computer_name}")
+        load_code(f"{code_name}-{QE_VERSION}@{computer}")
     except NotExistent:
         label = f"{code_name}-{QE_VERSION}"
         description = f"{code_name}.x ({QE_VERSION}) setup by AiiDAlab."
@@ -114,7 +126,7 @@ def _generate_string_to_setup_code(code_name, computer_name="localhost"):
 
 code.store()
 """.format(  # noqa: UP032
-        computer_name,
+        computer,
         label,
         description,
         filepath_executable,
@@ -127,77 +139,109 @@ def _generate_string_to_setup_code(code_name, computer_name="localhost"):
     return ""
 
 
-def setup_codes():
+def setup_codes(computer):
     python_code = _generate_header_to_setup_code()
     for code_name in CODE_NAMES:
-        python_code += _generate_string_to_setup_code(code_name)
+        python_code += _generate_string_to_setup_code(code_name, computer)
     try:
-        run(["python", "-c", python_code], capture_output=True, check=True)
-    except CalledProcessError as error:
-        raise RuntimeError(f"Failed to setup codes: {error}") from None
+        subprocess.run(["python", "-c", python_code], capture_output=True, check=True)
+    except subprocess.CalledProcessError as err:
+        raise RuntimeError(
+            f"Failed to set up codes, exit_code={err.returncode}, {err.stderr}"
+        ) from None
 
 
-def install(force=False):
+def install_and_setup(computer="localhost", force=False):
     """Install Quantum ESPRESSO and the corresponding AiiDA codes.
 
     Args:
        force: Ignore previously failed attempts and install anyways.
+       computer: the label of the AiiDA computer on which the codes are set up
     """
     # Check for "do not install file" and skip actual check. The purpose of
     # this file is to not re-try this process on every app start in case that
    # there are issues.
+    # XXX: use filelock to control `FN_DO_NOT_SETUP` as well
     if not force and FN_DO_NOT_SETUP.exists():
         raise RuntimeError("Installation failed in previous attempt.")
 
+    yield from _install()
+    yield from _setup(computer)
+
+
+def _install():
+    """Install Quantum ESPRESSO."""
     yield "Checking installation status..."
+
     conda_installed = which("conda")
     try:
-        with FileLock(FN_LOCKFILE, timeout=5):
-            # We assume that if the codes are already setup, everything is in
-            # order. Only if they are not present, should we take action,
-            # however we only do so if the environment has a conda binary
-            # present (`which conda`). If that is not the case then we assume
-            # that this is a custom user environment in which case we also take
-            # no further action.
-            if codes_are_setup():
-                return  # Already setup
-
+        with FileLock(FN_INSTALL_LOCKFILE, timeout=5):
             if not conda_installed:
                 raise RuntimeError(
                     "Unable to automatically install Quantum ESPRESSO, conda "
                     "is not available."
                 )
+            if qe_installed():
+                return
+
+            # Install Quantum ESPRESSO.
+            yield "Installing QE..."
+            try:
+                install_qe()
+            except subprocess.CalledProcessError as error:
+                raise RuntimeError(
+                    f"Failed to create conda environment: {error}"
+                ) from None
+
+    except Timeout:
+        # Assume that the installation was triggered by a different process.
+        yield "Installation was already started, waiting for it to finish..."
+        with FileLock(FN_INSTALL_LOCKFILE, timeout=120):
             if not qe_installed():
-                # First, install Quantum ESPRESSO.
-                yield "Installing QE..."
-                try:
-                    install_qe()
-                except CalledProcessError as error:
-                    raise RuntimeError(
-                        f"Failed to create conda environment: {error}"
-                    ) from None
+                raise RuntimeError(
+                    "Installation process did not finish in the expected time."
+                ) from None
+
+
+def _setup(computer):
+    """Set up the corresponding AiiDA codes after the QE installation."""
+    yield "Checking setup status..."
+
+    try:
+        with FileLock(FN_SETUP_LOCKFILE, timeout=5):
+            # We assume that if the codes are already set up, everything is in
+            # order. Only if they are not present should we take action;
+            # however, we only do so if the environment has a conda binary
+            # present (`which conda`). If that is not the case, then we assume
+            # that this is a custom user environment, in which case we also take
+            # no further action.
+            if codes_are_setup(computer=computer):
+                return  # Already setup
 
             # After installing QE, we install the corresponding
             # AiiDA codes:
             python_code = _generate_header_to_setup_code()
             for code_name in CODE_NAMES:
-                if not _code_is_setup(code_name):
-                    yield f"Preparing setup script for ({code_name})..."
-                    code_string = _generate_string_to_setup_code(code_name)
+                if not _code_is_setup(code_name, computer=computer):
+                    yield f"Preparing setup script for ({code_name}) on ({computer})..."
+                    code_string = _generate_string_to_setup_code(code_name, computer)
                     python_code += code_string
             try:
                 yield "Setting up all codes..."
-                run(["python", "-c", python_code], capture_output=True, check=True)
-            except CalledProcessError as error:
-                raise RuntimeError(f"Failed to setup codes: {error}") from None
+                subprocess.run(
+                    ["python", "-c", python_code], capture_output=True, check=True
+                )
+            except subprocess.CalledProcessError as err:
+                raise RuntimeError(
+                    f"Failed to set up codes, exit_code={err.returncode}, {err.stderr}"
+                ) from None
 
     except Timeout:
         # Assume that the installation was triggered by a different process.
         yield "Installation was already started, waiting for it to finish..."
-        with FileLock(FN_LOCKFILE, timeout=120):
-            if not codes_are_setup():
+        with FileLock(FN_SETUP_LOCKFILE, timeout=120):
+            if not codes_are_setup(computer=computer):
                 raise RuntimeError(
                     "Installation process did not finish in the expected time."
                 ) from None
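As a quick post-build sanity check (not part of this PR), the sketch below loads the HyperQueue computer and one of the QE codes inside a running container and prints the defaults that 42_setup-hq-computer.sh and 43_start-hq.sh are expected to have applied. It assumes the default HQ_COMPUTER label "localhost-hq", QE version 7.2, and that "pw" is among the installed code names.

```python
# Minimal verification sketch (assumptions: HQ_COMPUTER="localhost-hq", QE 7.2, "pw" code installed).
from aiida import load_profile
from aiida.orm import load_code, load_computer

load_profile()

computer = load_computer("localhost-hq")
print(computer.scheduler_type)                      # expected: "hyperqueue"
print(computer.get_minimum_job_poll_interval())     # expected: 2.0, set by 43_start-hq.sh
print(computer.get_default_mpiprocs_per_machine())  # derived from the container CPU limit

# Code labels follow "<name>-<version>@<computer>"; "pw" is one of the QE codes the app sets up.
code = load_code("pw-7.2@localhost-hq")
print(code.full_label)
```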