
Hq as light scheduler #795

Merged · 37 commits · Oct 17, 2024
3309bf6
before-notebook script for set up hq computer
unkcpz Aug 12, 2024
07ae579
Rename install to install_and_setup
unkcpz Aug 12, 2024
bc741b2
setup_codes which separate qe install and codes setup in two functions
unkcpz Aug 12, 2024
53fa9d4
Workable
unkcpz Aug 12, 2024
af86de4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 12, 2024
b14daec
finaly code setup is working!!
unkcpz Aug 13, 2024
708bbb7
Get mem/cpu info from cgroup
unkcpz Aug 13, 2024
bcfeb14
Round HQ cpu to integer
unkcpz Aug 13, 2024
2018592
Calculate number of CPU??
unkcpz Aug 13, 2024
1c22d1f
Correctly set NUM_CPU and MEM for hq local
unkcpz Aug 19, 2024
150af1b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Aug 19, 2024
8e4b2bd
num_cpus setting
unkcpz Aug 22, 2024
8434fc4
Use custom hq plugin
unkcpz Aug 22, 2024
f0cb9d7
resource setup specific for hq
unkcpz Aug 22, 2024
37151e0
fix after rebase
unkcpz Sep 10, 2024
6bf7aea
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 10, 2024
3cd731e
Use localhost as new computer name
unkcpz Sep 16, 2024
91fbf71
Setup default mpiprocs and memory for hq computer
unkcpz Sep 16, 2024
9763d7d
0 decimal for memory read from cgroup
unkcpz Sep 16, 2024
51ecd65
Update Dockerfile
unkcpz Sep 16, 2024
dcfdd42
Use edge full-stack which has bc and late daemon start
unkcpz Sep 17, 2024
eb79d2a
f-d: no if-else need to check if the computer already set
unkcpz Sep 17, 2024
dff3c90
f-d: polishing uv order
unkcpz Sep 17, 2024
b9909ba
HQ_COMPUTER as global ARG
unkcpz Sep 17, 2024
c2ef7a3
json format to check qe is installed or not
unkcpz Sep 17, 2024
f251d71
flag file to check tar initialization
unkcpz Sep 20, 2024
47b8526
Merge branch 'main' into hq
unkcpz Sep 20, 2024
9999b57
test untar home
unkcpz Sep 20, 2024
57385ce
revert to dh method to check
unkcpz Sep 20, 2024
36ea854
revert to use old
unkcpz Sep 20, 2024
bc2b322
use new base full-stack image
unkcpz Sep 20, 2024
58ea0f9
wc -l -le 1
unkcpz Sep 20, 2024
024e631
add placeholder in work dir
unkcpz Sep 20, 2024
d03ce60
separate qeapp copy
unkcpz Sep 20, 2024
ef49620
use flag file .FLAG_HOME_INITIALIZED
unkcpz Sep 20, 2024
473ca51
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 20, 2024
c472dcd
Merge branch 'main' into hq
unkcpz Oct 17, 2024
39 changes: 36 additions & 3 deletions Dockerfile
@@ -1,11 +1,13 @@
# syntax=docker/dockerfile:1
ARG FULL_STACK_VER=2024.1022
ARG FULL_STACK_VER=2024.1023
ARG UV_VER=0.4.7
ARG QE_VER=7.2
ARG QE_DIR=/opt/conda/envs/quantum-espresso-${QE_VER}
ARG HQ_VER=0.19.0

ARG UV_CACHE_DIR=/tmp/uv_cache
ARG QE_APP_SRC=/tmp/quantum-espresso
ARG HQ_COMPUTER="localhost-hq"

FROM ghcr.io/astral-sh/uv:${UV_VER} AS uv

@@ -43,22 +45,44 @@ RUN --mount=from=uv,source=/uv,target=/bin/uv \

# STAGE 3
# - Prepare AiiDA profile and localhost computer
# - Prepare hq computer using hyperqueue as scheduler
# - Install QE codes and pseudopotentials
# - Archive home folder
FROM build_deps AS home_build
ARG QE_DIR
ARG HQ_VER
ARG HQ_COMPUTER

# Install hq binary
RUN wget -c -O hq.tar.gz https://github.com/It4innovations/hyperqueue/releases/download/v${HQ_VER}/hq-v${HQ_VER}-linux-x64.tar.gz && \
tar xf hq.tar.gz -C /opt/conda/

ENV PSEUDO_FOLDER=/tmp/pseudo
RUN mkdir -p ${PSEUDO_FOLDER} && \
python -m aiidalab_qe download-pseudos --dest ${PSEUDO_FOLDER}

ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
# Install the aiida-hyperqueue
# XXX: fix me after aiida-hyperqueue is released
RUN --mount=from=uv,source=/uv,target=/bin/uv \
--mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
uv pip install --system --strict --cache-dir=${UV_CACHE_DIR} \
"aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"

COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/

ENV HQ_COMPUTER=$HQ_COMPUTER

# TODO: Remove PGSQL and daemon log files, and other unneeded files
RUN --mount=from=qe_conda_env,source=${QE_DIR},target=${QE_DIR} \
bash /usr/local/bin/before-notebook.d/20_start-postgresql.sh && \
bash /usr/local/bin/before-notebook.d/40_prepare-aiida.sh && \
python -m aiidalab_qe install-qe && \
bash /usr/local/bin/before-notebook.d/42_setup-hq-computer.sh && \
python -m aiidalab_qe install-qe --computer ${HQ_COMPUTER} && \
python -m aiidalab_qe install-pseudos --source ${PSEUDO_FOLDER} && \
verdi daemon stop && \
mamba run -n aiida-core-services pg_ctl stop && \
touch /home/${NB_USER}/.FLAG_HOME_INITIALIZED && \
cd /home/${NB_USER} && tar -cf /opt/conda/home.tar .

# STAGE 3 - Final stage
@@ -71,22 +95,31 @@ FROM ghcr.io/aiidalab/full-stack:${FULL_STACK_VER}
ARG QE_DIR
ARG QE_APP_SRC
ARG UV_CACHE_DIR
ARG HQ_COMPUTER
USER ${NB_USER}

WORKDIR /tmp
# Install python dependencies
# Use uv cache from the previous build step
# Install the aiida-hyperqueue
# XXX: fix me after aiida-hyperqueue is released
ENV UV_CONSTRAINT=${PIP_CONSTRAINT}
RUN --mount=from=uv,source=/uv,target=/bin/uv \
--mount=from=build_deps,source=${UV_CACHE_DIR},target=${UV_CACHE_DIR},rw \
--mount=from=build_deps,source=${QE_APP_SRC},target=${QE_APP_SRC},rw \
uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC}
uv pip install --strict --system --compile-bytecode --cache-dir=${UV_CACHE_DIR} ${QE_APP_SRC} "aiida-hyperqueue@git+https://github.com/aiidateam/aiida-hyperqueue"

# copy hq binary
COPY --from=home_build /opt/conda/hq /usr/local/bin/

COPY --from=qe_conda_env ${QE_DIR} ${QE_DIR}

USER root

COPY ./before-notebook.d/* /usr/local/bin/before-notebook.d/

ENV HQ_COMPUTER=$HQ_COMPUTER

# Remove content of $HOME
# '-mindepth=1' ensures that we do not remove the home directory itself.
RUN find /home/${NB_USER}/ -mindepth 1 -delete
@@ -4,7 +4,7 @@ set -eux
home="/home/${NB_USER}"

# Untar home archive file to restore home directory if it is empty
if [[ $(ls -A ${home} | wc -l) = "0" ]]; then
if [ ! -e $home/.FLAG_HOME_INITIALIZED ]; then
Contributor:

Not a huge fan of this solution since it seems brittle (user can remove this file).
I am somewhat confused, why does the previous one not work anymore?
(feel free to ignore, this is just me rambling :D)

Member Author:

Two problems with the previous check:

  1. The left side is the number 0 while the right side is the string "0", so the comparison does not behave as intended.
  2. On the k8s deployment, if a persistent volume is used, a lost+found folder already exists before this script runs, so the directory is never empty.

Another solution I tried was if [ $(ls -A ${home} | wc -l ) -lt 1 ]; then, but I assume that is even more brittle :-p
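The failure mode described above can be sketched in Python (a hypothetical helper, not part of the PR): a fresh persistent volume is never truly empty because of lost+found, so a plain "is the directory empty" check misfires unless such entries are ignored.

```python
import os
import tempfile

def is_effectively_empty(path, ignore=("lost+found",)):
    """Return True if `path` contains no entries besides the ignored ones."""
    return all(entry in ignore for entry in os.listdir(path))

# A fresh persistent volume typically contains only lost+found:
home = tempfile.mkdtemp()
os.mkdir(os.path.join(home, "lost+found"))
print(is_effectively_empty(home))   # → True: safe to restore the home archive

# Once anything else lands in the home directory, the check flips:
open(os.path.join(home, ".FLAG_HOME_INITIALIZED"), "w").close()
print(is_effectively_empty(home))   # → False
```

The flag-file approach chosen in the PR sidesteps the counting problem entirely, at the cost the contributor notes: a user can delete the flag.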

if [[ ! -f $HOME_TAR ]]; then
echo "File $HOME_TAR does not exist!"
exit 1
@@ -15,12 +15,20 @@ if [[ $(ls -A ${home} | wc -l) = "0" ]]; then
fi

echo "Extracting $HOME_TAR to $home"
# NOTE: tar reports an error when deployed to k8s, but at the moment it does not cause any issue
# tar: .: Cannot utime: Operation not permitted
# tar: .: Cannot change mode to rwxr-s---: Operation not permitted
tar -xf $HOME_TAR -C "$home"

echo "Copying directory '$QE_APP_FOLDER' to '$AIIDALAB_APPS'"
cp -r "$QE_APP_FOLDER" "$AIIDALAB_APPS"
else
echo "$home folder is not empty!"
ls -lrta "$home"
fi

if [ -d $AIIDALAB_APPS/quantum-espresso ]; then
echo "Quantum ESPRESSO app already exists"
else
echo "Copying directory '$QE_APP_FOLDER' to '$AIIDALAB_APPS'"
cp -r "$QE_APP_FOLDER" "$AIIDALAB_APPS"
fi
Comment on lines +27 to +32
Member Author:

This does the trick: in the k8s deployment, untarring empty directories (such as work) hits a permission issue. That does not cause any problem I can see, but it prevented the qeapp folder copy that followed, so I moved the copy out as an independent operation.


set +eux
21 changes: 21 additions & 0 deletions before-notebook.d/42_setup-hq-computer.sh
@@ -0,0 +1,21 @@
#!/bin/bash

set -x

# computer
verdi computer show ${HQ_COMPUTER} || verdi computer setup \
--non-interactive \
--label "${HQ_COMPUTER}" \
--description "local computer with hyperqueue scheduler" \
--hostname "localhost" \
--transport core.local \
--scheduler hyperqueue \
--work-dir /home/${NB_USER}/aiida_run/ \
--mpirun-command "mpirun -np {num_cpus}"

verdi computer configure core.local "${HQ_COMPUTER}" \
--non-interactive \
--safe-interval 5.0

# disable the localhost which is set in base image
verdi computer disable localhost aiida@localhost
59 changes: 59 additions & 0 deletions before-notebook.d/43_start-hq.sh
@@ -0,0 +1,59 @@
#!/bin/bash

set -x

# NOTE: this cgroup folder hierarchy is based on cgroup v2.
# If the container runs on a system that only has cgroup v1, the image build procedure will fail.
# Since the image is mostly for the demo server, where we know the machine and OS, I assume
# it has cgroup v2 (Kubernetes > v1.25).
# We only build the image for the demo server, so it does not require users to have the new cgroup.
# But for developers: please update your cgroup version to v2.
# See: https://kubernetes.io/docs/concepts/architecture/cgroups/#using-cgroupv2

# computer memory from runtime
MEMORY_LIMIT=$(cat /sys/fs/cgroup/memory.max)

if [ "$MEMORY_LIMIT" = "max" ]; then
MEMORY_LIMIT=4096
echo "No memory limit set, use 4GiB"
else
MEMORY_LIMIT=$(echo "scale=0; $MEMORY_LIMIT / (1024 * 1024)" | bc)
echo "Memory Limit: ${MEMORY_LIMIT} MiB"
fi

# Compute number of cpus allocated to the container
CPU_LIMIT=$(awk '{print $1}' /sys/fs/cgroup/cpu.max)
CPU_PERIOD=$(awk '{print $2}' /sys/fs/cgroup/cpu.max)

if [ "$CPU_PERIOD" -ne 0 ]; then
CPU_NUMBER=$(echo "scale=2; $CPU_LIMIT / $CPU_PERIOD" | bc)
echo "Number of CPUs allocated: $CPU_NUMBER"

# For the HQ setting, round down to an integer number of CPUs; the remainder is left for system tasks
CPU_LIMIT=$(echo "scale=0; $CPU_LIMIT / $CPU_PERIOD" | bc)
else
# If no limit is set (e.g. a local OCI runtime without a CPU limit), use all CPUs
CPU_LIMIT=$(nproc)
echo "No CPU limit set"
fi

# Start hq server with a worker
run-one-constantly hq server start 1>$HOME/.hq-stdout 2>$HOME/.hq-stderr &
run-one-constantly hq worker start --cpus=${CPU_LIMIT} --resource "mem=sum(${MEMORY_LIMIT})" --no-detect-resources &

# Reset the default memory_per_machine and default_mpiprocs_per_machine
# c.set_default_mpiprocs_per_machine = ${CPU_LIMIT}
# c.set_default_memory_per_machine = ${MEMORY_LIMIT}

# As with the original localhost computer, set the job poll interval to 2.0 secs.
# In addition, set the default mpiprocs and memory per machine.
# TODO: this runs every time the container starts; we need a lock file to prevent it.
job_poll_interval="2.0"
computer_name=${HQ_COMPUTER}
python -c "
from aiida import load_profile; from aiida.orm import load_computer;
load_profile();
load_computer('${computer_name}').set_minimum_job_poll_interval(${job_poll_interval})
load_computer('${computer_name}').set_default_mpiprocs_per_machine(${CPU_LIMIT})
load_computer('${computer_name}').set_default_memory_per_machine(${MEMORY_LIMIT})
"
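The cgroup v2 arithmetic the script performs with bc can be sketched in Python (hypothetical helper names, not part of the PR): cpu.max holds a "quota period" pair (or "max" for unlimited), and memory.max holds a byte count (or "max").

```python
def parse_cpu_max(cpu_max: str, fallback: int) -> int:
    """Whole number of CPUs implied by a cgroup v2 cpu.max line."""
    quota, period = cpu_max.split()
    if quota == "max":                  # no CPU limit set for the container
        return fallback                 # e.g. nproc on the host
    # Round down; the fractional remainder is left for system tasks.
    return int(quota) // int(period)

def parse_memory_max(memory_max: str, fallback_mib: int = 4096) -> int:
    """Memory limit in MiB implied by a cgroup v2 memory.max line."""
    if memory_max.strip() == "max":     # no memory limit set
        return fallback_mib
    return int(memory_max) // (1024 * 1024)

print(parse_cpu_max("200000 100000", fallback=8))   # → 2
print(parse_memory_max("max"))                      # → 4096
print(parse_memory_max("2147483648"))               # → 2048
```

These integer values are exactly what the script hands to hq worker start via --cpus and the mem resource.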
11 changes: 6 additions & 5 deletions src/aiidalab_qe/__main__.py
@@ -16,19 +16,20 @@ def cli():

@cli.command()
@click.option("-f", "--force", is_flag=True)
@click.option("--computer")
@click.option("-p", "--profile", default=_DEFAULT_PROFILE)
def install_qe(force, profile):
def install_qe(force, profile, computer):
from aiida import load_profile
from aiidalab_qe.setup.codes import codes_are_setup, install
from aiidalab_qe.setup.codes import codes_are_setup, install_and_setup

load_profile(profile)
try:
for msg in install(force=force):
for msg in install_and_setup(computer=computer, force=force):
click.echo(msg)
assert codes_are_setup()
assert codes_are_setup(computer=computer)
click.secho("Codes are setup!", fg="green")
except Exception as error:
raise click.ClickException(f"Failed to set up QE failed: {error}") from error
raise click.ClickException(f"Failed to set up QE: {error}") from error


@cli.command()
4 changes: 2 additions & 2 deletions src/aiidalab_qe/common/setup_codes.py
@@ -4,7 +4,7 @@
import ipywidgets as ipw
import traitlets

from ..setup.codes import QE_VERSION, install
from ..setup.codes import QE_VERSION, install_and_setup
from .widgets import ProgressBar

__all__ = [
@@ -66,7 +66,7 @@ def _refresh_installed(self):
try:
self.set_trait("busy", True)

for msg in install():
for msg in install_and_setup():
self.set_message(msg)

except Exception as error:
22 changes: 16 additions & 6 deletions src/aiidalab_qe/plugins/utils.py
@@ -3,12 +3,22 @@

def set_component_resources(component, code_info):
"""Set the resources for a given component based on the code info."""
if code_info: # Ensure code_info is not None or empty
component.metadata.options.resources = {
"num_machines": code_info["nodes"],
"num_mpiprocs_per_machine": code_info["ntasks_per_node"],
"num_cores_per_mpiproc": code_info["cpus_per_task"],
}
if code_info: # Ensure code_info is not None or empty (XXX from jyu: should we warn the plugin developer here?)
code: orm.Code = code_info["code"]
if code.computer.scheduler_type == "hyperqueue":
component.metadata.options.resources = {
"num_cpus": code_info["nodes"]
* code_info["ntasks_per_node"]
* code_info["cpus_per_task"]
}
else:
# XXX: jyu: properly handle a None scheduler_type, which can be "core.direct" (to be replaced by hyperqueue) or "core.slurm" ...
component.metadata.options.resources = {
"num_machines": code_info["nodes"],
"num_mpiprocs_per_machine": code_info["ntasks_per_node"],
"num_cores_per_mpiproc": code_info["cpus_per_task"],
}

component.metadata.options["max_wallclock_seconds"] = code_info[
"max_wallclock_seconds"
]
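The branch in set_component_resources can be sketched standalone (a simplified stand-in for the AiiDA objects, with the scheduler type passed in directly): HyperQueue schedules by a flat num_cpus count, while machine-based schedulers such as SLURM keep the nodes/mpiprocs/cores triple.

```python
def build_resources(scheduler_type: str, code_info: dict) -> dict:
    """Mirror the scheduler-dependent resources dict from the PR."""
    if scheduler_type == "hyperqueue":
        # HyperQueue takes a total CPU count, not a machine layout.
        return {
            "num_cpus": code_info["nodes"]
            * code_info["ntasks_per_node"]
            * code_info["cpus_per_task"]
        }
    # Machine-based schedulers (e.g. core.slurm) keep the usual triple.
    return {
        "num_machines": code_info["nodes"],
        "num_mpiprocs_per_machine": code_info["ntasks_per_node"],
        "num_cores_per_mpiproc": code_info["cpus_per_task"],
    }

info = {"nodes": 1, "ntasks_per_node": 4, "cpus_per_task": 2}
print(build_resources("hyperqueue", info))   # → {'num_cpus': 8}
print(build_resources("core.slurm", info))   # → the num_machines/... triple
```

Collapsing nodes × tasks × cpus into one number is what lets HyperQueue act as a "light" scheduler on a single container, as the PR title suggests.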