From 789ac9c8770751b1ff4936c87f9407cbd21556f8 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 1 Jul 2021 21:58:51 -0600 Subject: [PATCH 1/4] start of work to add demo containers the developer should easily be able to test spindle, and the user should be able to run a small example or tutorial. Ideally we can also extend a container to be able to build and test in CI Signed-off-by: vsoch --- docker/Dockerfile | 89 +++++++++++++++++ docker/Dockerfile.node | 3 + docker/README.md | 195 ++++++++++++++++++++++++++++++++++++++ docker/docker-compose.yml | 77 +++++++++++++++ docker/slurm.conf | 94 ++++++++++++++++++ docker/slurmdbd.conf | 37 ++++++++ 6 files changed, 495 insertions(+) create mode 100644 docker/Dockerfile create mode 100644 docker/Dockerfile.node create mode 100644 docker/README.md create mode 100644 docker/docker-compose.yml create mode 100644 docker/slurm.conf create mode 100644 docker/slurmdbd.conf diff --git a/docker/Dockerfile b/docker/Dockerfile new file mode 100644 index 0000000..de9df87 --- /dev/null +++ b/docker/Dockerfile @@ -0,0 +1,89 @@ +FROM centos:7 + +# docker build -t vanessa/slurm:20.11.8 . 
+ +LABEL org.label-schema.vcs-url="https://github.com/hpc/spindle" \ + org.label-schema.docker.cmd="docker-compose up -d" \ + org.label-schema.name="spindle" \ + org.label-schema.description="Spindle with SLURM on Centos 7" \ + maintainer="Vanessa Sochat" + +ARG SLURM_TAG=slurm-20-11-8-1 + +RUN set -ex \ + && yum makecache fast \ + && yum -y update \ + && yum -y install epel-release \ + && yum -y install \ + wget \ + bzip2 \ + perl \ + gcc \ + gcc-c++\ + git \ + gnupg \ + make \ + munge \ + munge-devel \ + python-devel \ + python-pip \ + python3 \ + python3-devel \ + python3-pip \ + mariadb-server \ + mariadb-devel \ + psmisc \ + bash-completion \ + vim-enhanced \ + automake \ + && yum clean all \ + && rm -rf /var/cache/yum + +RUN pip install Cython nose && pip3 install Cython nose + +RUN set -x \ + && git clone https://github.com/SchedMD/slurm.git \ + && pushd slurm \ + && git checkout tags/$SLURM_TAG \ + && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \ + --with-mysql_config=/usr/bin --libdir=/usr/lib64 \ + && make install \ + && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \ + && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \ + && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \ + && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \ + && popd \ + && rm -rf slurm \ + && groupadd -r --gid=995 slurm \ + && useradd -r -g slurm --uid=995 slurm \ + && mkdir /etc/sysconfig/slurm \ + /var/spool/slurmd \ + /var/run/slurmd \ + /var/run/slurmdbd \ + /var/lib/slurmd \ + /var/log/slurm \ + /data \ + && touch /var/lib/slurmd/node_state \ + /var/lib/slurmd/front_end_state \ + /var/lib/slurmd/job_state \ + /var/lib/slurmd/resv_state \ + /var/lib/slurmd/trigger_state \ + /var/lib/slurmd/assoc_mgr_state \ + /var/lib/slurmd/assoc_usage \ + /var/lib/slurmd/qos_usage \ + /var/lib/slurmd/fed_mgr_state \ + && chown -R slurm:slurm 
/var/*/slurm* \ + && /sbin/create-munge-key + +COPY slurm.conf /etc/slurm/slurm.conf +COPY slurmdbd.conf /etc/slurm/slurmdbd.conf + +COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh +ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] + +RUN yum install -y net-tools openssh-server openssh-clients singularity && \ + yum install -y epel-release centos-release-scl lsof sudo httpd24-mod_ssl httpd24-mod_ldap + +RUN groupadd spindle && \ + useradd --create-home --gid spindle spindle && \ + echo -n "spindle" | passwd --stdin spindle diff --git a/docker/Dockerfile.node b/docker/Dockerfile.node new file mode 100644 index 0000000..4988c15 --- /dev/null +++ b/docker/Dockerfile.node @@ -0,0 +1,3 @@ +FROM vanessa/slurm:18.08.6 + +# This container will be built on docker-compose up -d diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 0000000..f8197ae --- /dev/null +++ b/docker/README.md @@ -0,0 +1,195 @@ +# Spindle in Docker + +This directory contains a set of container recipes and scripts to allow you +to quickly bring up your own tiny cluster with [docker-compose](https://docs.docker.com/compose/install/), install +spindle, and give it a try. You will need both [docker-compose](https://docs.docker.com/compose/install/) +and [Docker](https://docs.docker.com/get-docker/) installed for this tutorial. + +## 1. Build Containers + +First, let's build a base container with slurm and centos with the [Dockerfile](Dockerfile) here: + +```bash +$ docker build -t vanessa/slurm:20.11.8 . +``` +Then building containers is as easy as: + +```bash +$ docker-compose build +``` + +And then bringing them up: + +```bash +$ docker-compose up -d +``` + +And checking that they are running + +```bash +$ docker-compose ps + Name Command State Ports +------------------------------------------------------------------------ +c1 /usr/local/bin/docker-entr ... Up 6818/tcp +c2 /usr/local/bin/docker-entr ... 
Up 6818/tcp +mysql docker-entrypoint.sh mysqld Up 3306/tcp, 33060/tcp +slurmctld /usr/local/bin/docker-entr ... Up 6817/tcp +slurmdbd /usr/local/bin/docker-entr ... Up 6819/tcp +``` + +Each of c1 and c2 are nodes for our cluster, and then slurmctld is like the login node. + +```bash +$ docker exec -it slurmctld bash +``` + +Try running a job! + +```bash +$ sbatch --wrap="sleep 20" +# squeue + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 1 normal wrap root R 0:00 1 c1 +``` + +## 2. Install spindle + +Now let's follow instructions to install spindle. + +```bash +$ git clone https://github.com/hpc/spindle +$ cd spindle +``` + +We want to install providing paths to munge and slurm. + +```bash +./configure --with-munge-dir=/etc/munge --enable-sec-munge --with-slurm-dir=/etc/slurm --enable-testsuite=no +make +make install +``` + +Note that we are disabling the test suite otherwise we'd get an install error not detecting +an MPI library. Now we can see spindle! + +``` +# spindle --help +Usage: spindle [OPTION...] mpi_command + + These options specify what types of files should be loaded through the Spindle + network + -a, --reloc-aout=yes|no Relocate the main executable through Spindle. + Default: yes + -f, --follow-fork=yes|no Relocate objects in fork'd child processes. + Default: yes + -l, --reloc-libs=yes|no Relocate shared libraries through Spindle. + Default: yes + -x, --reloc-exec=yes|no Relocate the targets of exec/execv/execve/... + calls. Default: yes + -y, --reloc-python=yes|no Relocate python modules (.py/.pyc) files when + loaded via python. Default: yes + + These options specify how the Spindle network should distibute files. Push is + better for SPMD programs. Pull is better for MPMD programs. Default is push. 
+ -p, --push Use a push model where objects loaded by any + process are made available to all processes + -q, --pull Use a pull model where objects are only made + available to processes that require them + + These options configure Spindle's network model. Typical Spindle runs should + not need to set these. + -c, --cobo Use a tree-based cobo network for distributing + objects + -t, --port=port1-port2 TCP/IP port range for Spindle servers. Default: + 21940-21964 + + These options specify the security model Spindle should use for validating TCP + connections. Spindle will choose a default value if no option is specified. + --security-munge Use munge for security authentication + + These options specify the job launcher Spindle is being run with. If + unspecified, Spindle will try to autodetect. + --launcher-startup Launch spindle daemons using the system's job + launcher (requires an already set-up session). + --no-mpi Run serial jobs instead of MPI job + --openmpi MPI job is launched with the OpenMPI job jauncher. + + --slurm MPI job is launched with the srun job launcher. + --wreck MPI Job is launched with the wreck job launcher. + + Options for managing sessions, which can run multiple jobs out of one spindle + cache. + --end-session=session-id End a persistent Spindle session with the + given session-id + --run-in-session=session-id + Run a new job in the given session + --start-session Start a persistent Spindle session and print the + session-id to stdout + + Misc options + -b, --shmcache-size=size Size of client shared memory cache in kilobytes, + which can be used to improve performance if + multiple processes are running on each node. + Default: 0 + --cache-prefix=path Alias for python-prefix + --cleanup-proc=yes|no Fork a dedicated process to clean-up files + post-spindle. Useful for high-fault situations. + Default: no + -d, --debug=yes|no If yes, hide spindle from debuggers so they think + libraries come from the original locations. 
May + cause extra overhead. Default: yes + -e, --preload=FILE Provides a text file containing a white-space + separated list of files that should be relocated + to each node before execution begins + --enable-rsh=yes|no Enable startint daemons with an rsh tree, if the + startup mode supports it. Default: No + --hostbin=EXECUTABLE Path to a script that returns the hostlist for a + job on a cluster + -h, --no-hide Don't hide spindle file descriptors from + application + -k, --audit-type=subaudit|audit + Use the new-style subaudit interface for + intercepting ld.so, or the old-style audit + interface. The subaudit option reduces memory + overhead, but is more complex. Default is audit. + --msgcache-buffer=size Enables message buffering if size is non-zero, + otherwise sets the size of the buffer in + kilobytes + --msgcache-timeout=timeout Enables message buffering if size is + non-zero, otherwise sets the buffering timeout in + milliseconds + -n, --noclean=yes|no Don't remove local file cache after execution. + Default: no (removes the cache) + -o, --location=directory Back-end directory for storing relocated files. + Should be a non-shared location such as a ramdisk. + Default: $TMPDIR + --persist=yes|no Allow spindle servers to persist after the last + client job has exited. Default: No + -r, --python-prefix=path Colon-seperated list of directories that contain + the python install location + -s, --strip=yes|no Strip debug and symbol information from binaries + before distributing them. Default: yes + + -?, --help Give this help list + --usage Give a short usage message + -V, --version Print program version + +Mandatory or optional arguments to long options are also mandatory or optional +for any corresponding short options. + +Report bugs to legendre1@llnl.gov. +``` + +## 3. Use Spindle + +**TODO** we need a dummy example here + + +## 4. 
Clean Up + +When you are done, exit from the container, stop and remove your images: + +```bash +$ docker-compose stop +$ docker-compose rm +``` diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml new file mode 100644 index 0000000..9ea3ecb --- /dev/null +++ b/docker/docker-compose.yml @@ -0,0 +1,77 @@ +version: "2.2" + +services: + mysql: + image: mysql:5.7 + hostname: mysql + container_name: mysql + environment: + MYSQL_RANDOM_ROOT_PASSWORD: "yes" + MYSQL_DATABASE: slurm_acct_db + MYSQL_USER: slurm + MYSQL_PASSWORD: password + volumes: + - var_lib_mysql:/var/lib/mysql + + slurmdbd: + image: vanessa/slurm:18.08.6 + command: "slurmdbd" + container_name: slurmdbd + hostname: slurmdbd + volumes: + - etc_munge:/etc/munge + - etc_slurm:/etc/slurm + - var_log_slurm:/var/log/slurm + expose: + - "6819" + depends_on: + - mysql + + slurmctld: + image: vanessa/slurm:18.08.6 + command: "slurmctld" + container_name: slurmctld + hostname: slurmctld + volumes_from: + - slurmdbd + expose: + - "6817" + depends_on: + - "slurmdbd" + + c1: + build: + context: . + dockerfile: Dockerfile.node + command: "slurmd" + privileged: true + hostname: c1 + container_name: c1 + volumes_from: + - slurmctld + expose: + - "6818" + depends_on: + - "slurmctld" + + c2: + build: + context: . + dockerfile: Dockerfile.node + command: "slurmd" + privileged: true + hostname: c2 + container_name: c2 + volumes_from: + - slurmctld + expose: + - "6818" + depends_on: + - "slurmctld" + +volumes: + etc_munge: + etc_slurm: + slurm_jobdir: + var_lib_mysql: + var_log_slurm: diff --git a/docker/slurm.conf b/docker/slurm.conf new file mode 100644 index 0000000..5391519 --- /dev/null +++ b/docker/slurm.conf @@ -0,0 +1,94 @@ +# slurm.conf +# +# See the slurm.conf man page for more information. 
+# +ClusterName=linux +ControlMachine=slurmctld +ControlAddr=slurmctld +#BackupController= +#BackupAddr= +# +SlurmUser=slurm +#SlurmdUser=root +SlurmctldPort=6817 +SlurmdPort=6818 +AuthType=auth/munge +#JobCredentialPrivateKey= +#JobCredentialPublicCertificate= +StateSaveLocation=/var/lib/slurmd +SlurmdSpoolDir=/var/spool/slurmd +SwitchType=switch/none +MpiDefault=none +SlurmctldPidFile=/var/run/slurmd/slurmctld.pid +SlurmdPidFile=/var/run/slurmd/slurmd.pid +ProctrackType=proctrack/linuxproc +#PluginDir= +CacheGroups=0 +#FirstJobId= +ReturnToService=0 +#MaxJobCount= +#PlugStackConfig= +#PropagatePrioProcess= +#PropagateResourceLimits= +#PropagateResourceLimitsExcept= +#Prolog= +#Epilog= +#SrunProlog= +#SrunEpilog= +#TaskProlog= +#TaskEpilog= +#TaskPlugin= +#TrackWCKey=no +#TreeWidth=50 +#TmpFS= +#UsePAM= +# +# TIMERS +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 +# +# SCHEDULING +SchedulerType=sched/backfill +#SchedulerAuth= +#SchedulerPort= +#SchedulerRootFilter= +SelectType=select/cons_res +SelectTypeParameters=CR_CPU_Memory +FastSchedule=1 +#PriorityType=priority/multifactor +#PriorityDecayHalfLife=14-0 +#PriorityUsageResetPeriod=14-0 +#PriorityWeightFairshare=100000 +#PriorityWeightAge=1000 +#PriorityWeightPartition=10000 +#PriorityWeightJobSize=1000 +#PriorityMaxAge=1-0 +# +# LOGGING +SlurmctldDebug=3 +SlurmctldLogFile=/var/log/slurm/slurmctld.log +SlurmdDebug=3 +SlurmdLogFile=/var/log/slurm/slurmd.log +JobCompType=jobcomp/filetxt +JobCompLoc=/var/log/slurm/jobcomp.log +# +# ACCOUNTING +JobAcctGatherType=jobacct_gather/linux +JobAcctGatherFrequency=30 +# +AccountingStorageType=accounting_storage/slurmdbd +AccountingStorageHost=slurmdbd +AccountingStoragePort=6819 +AccountingStorageLoc=slurm_acct_db +#AccountingStoragePass= +#AccountingStorageUser= +# +# COMPUTE NODES +NodeName=c[1-2] RealMemory=1000 State=UNKNOWN +# +# PARTITIONS +PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 
Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP diff --git a/docker/slurmdbd.conf b/docker/slurmdbd.conf new file mode 100644 index 0000000..62d6fe2 --- /dev/null +++ b/docker/slurmdbd.conf @@ -0,0 +1,37 @@ +# +# Example slurmdbd.conf file. +# +# See the slurmdbd.conf man page for more information. +# +# Archive info +#ArchiveJobs=yes +#ArchiveDir="/tmp" +#ArchiveSteps=yes +#ArchiveScript= +#JobPurge=12 +#StepPurge=1 +# +# Authentication info +AuthType=auth/munge +#AuthInfo=/var/run/munge/munge.socket.2 +# +# slurmDBD info +DbdAddr=slurmdbd +DbdHost=slurmdbd +#DbdPort=6819 +SlurmUser=slurm +#MessageTimeout=300 +DebugLevel=4 +#DefaultQOS=normal,standby +LogFile=/var/log/slurm/slurmdbd.log +PidFile=/var/run/slurmdbd/slurmdbd.pid +#PluginDir=/usr/lib/slurm +#PrivateData=accounts,users,usage,jobs +#TrackWCKey=yes +# +# Database info +StorageType=accounting_storage/mysql +StorageHost=mysql +StorageUser=slurm +StoragePass=password +StorageLoc=slurm_acct_db From fd3d86ce3a5d28ada8035425ccfdcb7b6a3a234d Mon Sep 17 00:00:00 2001 From: vsoch Date: Tue, 24 Aug 2021 17:35:16 -0600 Subject: [PATCH 2/4] updating readme with instructions Signed-off-by: vsoch --- docker/Dockerfile | 88 ++--------- docker/Dockerfile.node | 3 - docker/README.md | 322 ++++++++++++++++++++++---------------- docker/docker-compose.yml | 14 +- docker/slurm.conf | 59 +++---- docker/slurmdbd.conf | 18 ++- 6 files changed, 246 insertions(+), 258 deletions(-) delete mode 100644 docker/Dockerfile.node diff --git a/docker/Dockerfile b/docker/Dockerfile index de9df87..75ab489 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,4 @@ -FROM centos:7 - -# docker build -t vanessa/slurm:20.11.8 . 
+FROM vanessa/slurm:18.08.6 LABEL org.label-schema.vcs-url="https://github.com/hpc/spindle" \ org.label-schema.docker.cmd="docker-compose up -d" \ @@ -8,81 +6,21 @@ LABEL org.label-schema.vcs-url="https://github.com/hpc/spindle" \ org.label-schema.description="Spindle with SLURM on Centos 7" \ maintainer="Vanessa Sochat" -ARG SLURM_TAG=slurm-20-11-8-1 - -RUN set -ex \ - && yum makecache fast \ - && yum -y update \ - && yum -y install epel-release \ - && yum -y install \ - wget \ - bzip2 \ - perl \ - gcc \ - gcc-c++\ - git \ - gnupg \ - make \ - munge \ - munge-devel \ - python-devel \ - python-pip \ - python3 \ - python3-devel \ - python3-pip \ - mariadb-server \ - mariadb-devel \ - psmisc \ - bash-completion \ - vim-enhanced \ - automake \ - && yum clean all \ - && rm -rf /var/cache/yum - -RUN pip install Cython nose && pip3 install Cython nose - -RUN set -x \ - && git clone https://github.com/SchedMD/slurm.git \ - && pushd slurm \ - && git checkout tags/$SLURM_TAG \ - && ./configure --enable-debug --prefix=/usr --sysconfdir=/etc/slurm \ - --with-mysql_config=/usr/bin --libdir=/usr/lib64 \ - && make install \ - && install -D -m644 etc/cgroup.conf.example /etc/slurm/cgroup.conf.example \ - && install -D -m644 etc/slurm.conf.example /etc/slurm/slurm.conf.example \ - && install -D -m644 etc/slurmdbd.conf.example /etc/slurm/slurmdbd.conf.example \ - && install -D -m644 contribs/slurm_completion_help/slurm_completion.sh /etc/profile.d/slurm_completion.sh \ - && popd \ - && rm -rf slurm \ - && groupadd -r --gid=995 slurm \ - && useradd -r -g slurm --uid=995 slurm \ - && mkdir /etc/sysconfig/slurm \ - /var/spool/slurmd \ - /var/run/slurmd \ - /var/run/slurmdbd \ - /var/lib/slurmd \ - /var/log/slurm \ - /data \ - && touch /var/lib/slurmd/node_state \ - /var/lib/slurmd/front_end_state \ - /var/lib/slurmd/job_state \ - /var/lib/slurmd/resv_state \ - /var/lib/slurmd/trigger_state \ - /var/lib/slurmd/assoc_mgr_state \ - /var/lib/slurmd/assoc_usage \ - 
/var/lib/slurmd/qos_usage \ - /var/lib/slurmd/fed_mgr_state \ - && chown -R slurm:slurm /var/*/slurm* \ - && /sbin/create-munge-key +# Install ompi +RUN wget https://www.open-mpi.org/software/ompi/v1.10/downloads/openmpi-1.10.2.tar.gz && \ + tar -xzvf openmpi-1.10.2.tar.gz && cd openmpi-1.10.2/ && \ + ./configure --with-slurm --prefix="/home/$USER/.openmpi" && \ + make && make install -COPY slurm.conf /etc/slurm/slurm.conf -COPY slurmdbd.conf /etc/slurm/slurmdbd.conf +ENV PATH "$PATH:/home/$USER/.openmpi/bin" +ENV LD_LIBRARY_PATH "$LD_LIBRARY_PATH:/home/$USER/.openmpi/lib/" -COPY docker-entrypoint.sh /usr/local/bin/docker-entrypoint.sh -ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"] +ENV TMPDIR /tmp -RUN yum install -y net-tools openssh-server openssh-clients singularity && \ - yum install -y epel-release centos-release-scl lsof sudo httpd24-mod_ssl httpd24-mod_ldap +RUN yum install -y automake && git clone https://github.com/hpc/spindle && \ + cd spindle && \ + ./configure --with-munge-dir=/etc/munge --enable-sec-munge --with-slurm-dir=/etc/slurm --with-testrm=slurm && \ + make && make install RUN groupadd spindle && \ useradd --create-home --gid spindle spindle && \ diff --git a/docker/Dockerfile.node b/docker/Dockerfile.node deleted file mode 100644 index 4988c15..0000000 --- a/docker/Dockerfile.node +++ /dev/null @@ -1,3 +0,0 @@ -FROM vanessa/slurm:18.08.6 - -# This container will be built on docker-compose up -d diff --git a/docker/README.md b/docker/README.md index f8197ae..962264f 100644 --- a/docker/README.md +++ b/docker/README.md @@ -7,12 +7,8 @@ and [Docker](https://docs.docker.com/get-docker/) installed for this tutorial. ## 1. Build Containers -First, let's build a base container with slurm and centos with the [Dockerfile](Dockerfile) here: - -```bash -$ docker build -t vanessa/slurm:20.11.8 . -``` -Then building containers is as easy as: +The [Dockerfile](Dockerfile) here is the base for building containers. 
+So running the build is as easy as: ```bash $ docker-compose build @@ -52,137 +48,197 @@ $ sbatch --wrap="sleep 20" 1 normal wrap root R 0:00 1 c1 ``` -## 2. Install spindle - -Now let's follow instructions to install spindle. - -```bash -$ git clone https://github.com/hpc/spindle -$ cd spindle -``` - -We want to install providing paths to munge and slurm. - -```bash -./configure --with-munge-dir=/etc/munge --enable-sec-munge --with-slurm-dir=/etc/slurm --enable-testsuite=no -make -make install -``` - -Note that we are disabling the test suite otherwise we'd get an install error not detecting -an MPI library. Now we can see spindle! - -``` -# spindle --help -Usage: spindle [OPTION...] mpi_command - - These options specify what types of files should be loaded through the Spindle - network - -a, --reloc-aout=yes|no Relocate the main executable through Spindle. - Default: yes - -f, --follow-fork=yes|no Relocate objects in fork'd child processes. - Default: yes - -l, --reloc-libs=yes|no Relocate shared libraries through Spindle. - Default: yes - -x, --reloc-exec=yes|no Relocate the targets of exec/execv/execve/... - calls. Default: yes - -y, --reloc-python=yes|no Relocate python modules (.py/.pyc) files when - loaded via python. Default: yes - - These options specify how the Spindle network should distibute files. Push is - better for SPMD programs. Pull is better for MPMD programs. Default is push. - -p, --push Use a push model where objects loaded by any - process are made available to all processes - -q, --pull Use a pull model where objects are only made - available to processes that require them - - These options configure Spindle's network model. Typical Spindle runs should - not need to set these. - -c, --cobo Use a tree-based cobo network for distributing - objects - -t, --port=port1-port2 TCP/IP port range for Spindle servers. Default: - 21940-21964 - - These options specify the security model Spindle should use for validating TCP - connections. 
Spindle will choose a default value if no option is specified. - --security-munge Use munge for security authentication - - These options specify the job launcher Spindle is being run with. If - unspecified, Spindle will try to autodetect. - --launcher-startup Launch spindle daemons using the system's job - launcher (requires an already set-up session). - --no-mpi Run serial jobs instead of MPI job - --openmpi MPI job is launched with the OpenMPI job jauncher. - - --slurm MPI job is launched with the srun job launcher. - --wreck MPI Job is launched with the wreck job launcher. - - Options for managing sessions, which can run multiple jobs out of one spindle - cache. - --end-session=session-id End a persistent Spindle session with the - given session-id - --run-in-session=session-id - Run a new job in the given session - --start-session Start a persistent Spindle session and print the - session-id to stdout - - Misc options - -b, --shmcache-size=size Size of client shared memory cache in kilobytes, - which can be used to improve performance if - multiple processes are running on each node. - Default: 0 - --cache-prefix=path Alias for python-prefix - --cleanup-proc=yes|no Fork a dedicated process to clean-up files - post-spindle. Useful for high-fault situations. - Default: no - -d, --debug=yes|no If yes, hide spindle from debuggers so they think - libraries come from the original locations. May - cause extra overhead. Default: yes - -e, --preload=FILE Provides a text file containing a white-space - separated list of files that should be relocated - to each node before execution begins - --enable-rsh=yes|no Enable startint daemons with an rsh tree, if the - startup mode supports it. 
Default: No - --hostbin=EXECUTABLE Path to a script that returns the hostlist for a - job on a cluster - -h, --no-hide Don't hide spindle file descriptors from - application - -k, --audit-type=subaudit|audit - Use the new-style subaudit interface for - intercepting ld.so, or the old-style audit - interface. The subaudit option reduces memory - overhead, but is more complex. Default is audit. - --msgcache-buffer=size Enables message buffering if size is non-zero, - otherwise sets the size of the buffer in - kilobytes - --msgcache-timeout=timeout Enables message buffering if size is - non-zero, otherwise sets the buffering timeout in - milliseconds - -n, --noclean=yes|no Don't remove local file cache after execution. - Default: no (removes the cache) - -o, --location=directory Back-end directory for storing relocated files. - Should be a non-shared location such as a ramdisk. - Default: $TMPDIR - --persist=yes|no Allow spindle servers to persist after the last - client job has exited. Default: No - -r, --python-prefix=path Colon-seperated list of directories that contain - the python install location - -s, --strip=yes|no Strip debug and symbol information from binaries - before distributing them. Default: yes - - -?, --help Give this help list - --usage Give a short usage message - -V, --version Print program version - -Mandatory or optional arguments to long options are also mandatory or optional -for any corresponding short options. - -Report bugs to legendre1@llnl.gov. +## 2. Interact with spindle + +Spindle should already be installed, and you can see the steps if you view the +[Dockerfile](Dockerfile). 
+ +```bash +# which spindle +/usr/local/bin/spindle +``` + +You can try running a job first without spindle: + +```bash +$ srun -N 1 cat /proc/self/maps +``` + +If you try *with* spindle, this won't currently work: + +```bash +$ spindle srun -N 1 cat /proc/self/maps +``` + +So instead you can get an allocation first: + +```bash +$ salloc -N 1 +``` + +If you want to view the source code, go to /spindle. + +```bash +cd /spindle/testsuite +./runTests ``` ## 3. Use Spindle -**TODO** we need a dummy example here +The first sanity check to see if spindle is working is to look at this output: + +```bash +$ cat /proc/self/maps +[root@slurmctld /]# cat /proc/self/maps +00400000-0040b000 r-xp 00000000 00:7d 27450779 /usr/bin/cat +0060b000-0060c000 r--p 0000b000 00:7d 27450779 /usr/bin/cat +0060c000-0060d000 rw-p 0000c000 00:7d 27450779 /usr/bin/cat +0189d000-018be000 rw-p 00000000 00:00 0 [heap] +7f2204c09000-7f2204dcb000 r-xp 00000000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f2204dcb000-7f2204fcb000 ---p 001c2000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f2204fcb000-7f2204fcf000 r--p 001c2000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f2204fcf000-7f2204fd1000 rw-p 001c6000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f2204fd1000-7f2204fd6000 rw-p 00000000 00:00 0 +7f2204fd6000-7f2204ff8000 r-xp 00000000 00:7d 27466129 /usr/lib64/ld-2.17.so +7f2205064000-7f22051ed000 r--p 00000000 00:7d 27573565 /usr/lib/locale/locale-archive +7f22051ed000-7f22051f0000 rw-p 00000000 00:00 0 +7f22051f6000-7f22051f7000 rw-p 00000000 00:00 0 +7f22051f7000-7f22051f8000 r--p 00021000 00:7d 27466129 /usr/lib64/ld-2.17.so +7f22051f8000-7f22051f9000 rw-p 00022000 00:7d 27466129 /usr/lib64/ld-2.17.so +7f22051f9000-7f22051fa000 rw-p 00000000 00:00 0 +7fff660dd000-7fff660fe000 rw-p 00000000 00:00 0 [stack] +7fff661dc000-7fff661df000 r--p 00000000 00:00 0 [vvar] +7fff661df000-7fff661e0000 r-xp 00000000 00:00 0 [vdso] +ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall] +``` + +and compare that 
with the same command, but with spindle: + +```bash +$ spindle --no-mpi cat /proc/self/maps +00400000-0040b000 r-xp 00000000 00:7d 28646903 /tmp/spindle.84/usr/bin/1-spindlens-file-cat +0060b000-0060c000 r--p 0000b000 00:7d 27450779 /usr/bin/cat +0060c000-0060d000 rw-p 0000c000 00:7d 27450779 /usr/bin/cat +01dc8000-01de9000 rw-p 00000000 00:00 0 [heap] +7f8622f69000-7f86230f2000 r--p 00000000 00:7d 27573565 /usr/lib/locale/locale-archive +7f86230f2000-7f86232b4000 r-xp 00000000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f86232b4000-7f86234b4000 ---p 001c2000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f86234b4000-7f86234b8000 r--p 001c2000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f86234b8000-7f86234ba000 rw-p 001c6000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f86234ba000-7f86234bf000 rw-p 00000000 00:00 0 +7f86234bf000-7f8623681000 r-xp 00000000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f8623681000-7f8623881000 ---p 001c2000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f8623881000-7f8623887000 rw-p 001c2000 00:7d 27466152 /usr/lib64/libc-2.17.so +7f8623887000-7f862388c000 rw-p 00000000 00:00 0 +7f862388c000-7f86238a9000 r-xp 00000000 00:7d 28646902 /tmp/spindle.84/usr/local/lib/spindle/0-spindlens-file-libspindle_audit_pipe.so +7f86238a9000-7f8623aa8000 ---p 0001d000 00:7d 28646902 /tmp/spindle.84/usr/local/lib/spindle/0-spindlens-file-libspindle_audit_pipe.so +7f8623aa8000-7f8623aaa000 rw-p 0001c000 00:7d 28646902 /tmp/spindle.84/usr/local/lib/spindle/0-spindlens-file-libspindle_audit_pipe.so +7f8623aaa000-7f8623aad000 rw-p 00000000 00:00 0 +7f8623aad000-7f8623acf000 r-xp 00000000 00:7d 27466129 /usr/lib64/ld-2.17.so +7f8623bc3000-7f8623cc5000 rw-p 00000000 00:00 0 +7f8623ccb000-7f8623cce000 rw-p 00000000 00:00 0 +7f8623cce000-7f8623cd0000 rw-p 00021000 00:7d 27466129 /usr/lib64/ld-2.17.so +7f8623cd0000-7f8623cd1000 rw-p 00000000 00:00 0 +7ffd73932000-7ffd73953000 rw-p 00000000 00:00 0 [stack] +7ffd739bd000-7ffd739c0000 r--p 00000000 00:00 0 [vvar] +7ffd739c0000-7ffd739c1000 
r-xp 00000000 00:00 0 [vdso] +ffffffffff600000-ffffffffff601000 --xp 00000000 00:00 0 [vsyscall] +``` + +You should see several paths replaced with spindle ones. + +Next, let's try running a benchmarking tool with and without spindle. +This benchmark is called "Pynamic." Let's first clone it: + +```bash +$ git clone https://github.com/LLNL/pynamic +$ cd pynamic/pynamic-pyMPI-2.6a1 + +# Run python config_pynamic.py to see usage +``` +And then we would build shared libraries as follows. We are doing to decrease +from the default because it will take forever! + +```bash +# usage: config_pynamic.py [options] [-c ] +# example: config_pynamic.py 900 1250 -e -u 350 1250 -n 150 +# = total number of shared objects to produce +# = average number of functions per shared object +$ python config_pynamic.py 900 1250 -e -u 350 1250 -n 150 +``` + +Don't actually do that - it will never finish and control+C won't kill it! +Try this one instead, with a timer + +```bash +$ time python config_pynamic.py 30 1250 -e -u 350 1250 -n 150 + +************************************************ +summary of pynamic-sdb-pyMPI executable and 10 shared libraries +Size of aggregate total of shared libraries: 2.5MB +Size of aggregate texts of shared libraries: 6.8MB +Size of aggregate data of shared libraries: 408.4KB +Size of aggregate debug sections of shared libraries: 0B +Size of aggregate symbol tables of shared libraries: 0B +Size of aggregate string table size of shared libraries: 0B +************************************************ + +real 21m33.556s +user 14m54.538s +sys 3m31.206s +``` + +The above does take a bit (as you can see from the time) so let's try it now with +spindle: + +```bash +$ time spindle python config_pynamic.py 30 1250 -e -u 350 1250 -n 150 +``` + +**under development, not written yet, debugging things!** + +``` + 3.1 TO TEST + % python pynamic_driver.py `date +%s` + + or in a batchxterm: + + % srun pyMPI pynamic_driver.py `date +%s` + + % srun pynamic-pyMPI 
pynamic_driver.py `date +%s` + + % srun pynamic-sdb-pyMPI pynamic_driver.py `date +%s` + + % srun pynamic-bigexe pynamic_driver.py `date +%s` + + # note: Pynamic creates 3 executables: + # pyMPI - a vanilla pyMPI build + # pynamic-pyMPI - pyMPI with the generated .so's linked in + # pynamic-sdb-pyMPI - pyMPI with the generated libraries statically linked in + # and 2 optional executables (with the -b flag) + # pynamic-bigexe-pyMPI - a larger pyMPI with the generated .so's linked in + # pynamic-bigexe-sdb-pyMPI - a larger pyMPI with the generated libraries staically linked in + +-------------------------------------------------------- +4. CONTACTS + Greg Lee + Dong Ahn + Bronis de Supinski + John Gyllenhaal + +# run the pynamic benchmark with and without spindle +/cat/proc/self/maps + +prints out for each library loaded and bincat parts of address space takes up +run same command under spindle + +spindle --no-mpi cat /proc/self/maps + +to check if install works and is visible outside of spindle itself +/proc/pid/maps + +https://github.com/LLNL/pynamic +``` ## 4. Clean Up diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 9ea3ecb..9a117c3 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -12,9 +12,10 @@ services: MYSQL_PASSWORD: password volumes: - var_lib_mysql:/var/lib/mysql - + slurmdbd: - image: vanessa/slurm:18.08.6 + build: + context: . command: "slurmdbd" container_name: slurmdbd hostname: slurmdbd @@ -22,13 +23,15 @@ services: - etc_munge:/etc/munge - etc_slurm:/etc/slurm - var_log_slurm:/var/log/slurm + - tmp_slurm:/tmp expose: - "6819" depends_on: - mysql slurmctld: - image: vanessa/slurm:18.08.6 + build: + context: . command: "slurmctld" container_name: slurmctld hostname: slurmctld @@ -42,7 +45,6 @@ services: c1: build: context: . - dockerfile: Dockerfile.node command: "slurmd" privileged: true hostname: c1 @@ -57,7 +59,6 @@ services: c2: build: context: . 
- dockerfile: Dockerfile.node command: "slurmd" privileged: true hostname: c2 @@ -68,10 +69,11 @@ services: - "6818" depends_on: - "slurmctld" - + volumes: etc_munge: etc_slurm: slurm_jobdir: var_lib_mysql: var_log_slurm: + tmp_slurm: diff --git a/docker/slurm.conf b/docker/slurm.conf index 5391519..8686e23 100644 --- a/docker/slurm.conf +++ b/docker/slurm.conf @@ -1,10 +1,16 @@ -# slurm.conf +# +# Example slurm.conf file. Please run configurator.html +# (in doc/html) to build a configuration file customized +# for your environment. +# +# +# slurm.conf file generated by configurator.html. # # See the slurm.conf man page for more information. # ClusterName=linux -ControlMachine=slurmctld -ControlAddr=slurmctld +ControlMachine=linux0 +#ControlAddr= #BackupController= #BackupAddr= # @@ -15,15 +21,14 @@ SlurmdPort=6818 AuthType=auth/munge #JobCredentialPrivateKey= #JobCredentialPublicCertificate= -StateSaveLocation=/var/lib/slurmd -SlurmdSpoolDir=/var/spool/slurmd +StateSaveLocation=/var/spool/slurm/ctld +SlurmdSpoolDir=/var/spool/slurm/d SwitchType=switch/none MpiDefault=none -SlurmctldPidFile=/var/run/slurmd/slurmctld.pid -SlurmdPidFile=/var/run/slurmd/slurmd.pid -ProctrackType=proctrack/linuxproc +SlurmctldPidFile=/var/run/slurmctld.pid +SlurmdPidFile=/var/run/slurmd.pid +ProctrackType=proctrack/pgid #PluginDir= -CacheGroups=0 #FirstJobId= ReturnToService=0 #MaxJobCount= @@ -54,11 +59,8 @@ Waittime=0 # SCHEDULING SchedulerType=sched/backfill #SchedulerAuth= -#SchedulerPort= -#SchedulerRootFilter= -SelectType=select/cons_res -SelectTypeParameters=CR_CPU_Memory -FastSchedule=1 +SelectType=select/cons_tres +SelectTypeParameters=CR_Core #PriorityType=priority/multifactor #PriorityDecayHalfLife=14-0 #PriorityUsageResetPeriod=14-0 @@ -69,26 +71,17 @@ FastSchedule=1 #PriorityMaxAge=1-0 # # LOGGING -SlurmctldDebug=3 -SlurmctldLogFile=/var/log/slurm/slurmctld.log -SlurmdDebug=3 -SlurmdLogFile=/var/log/slurm/slurmd.log -JobCompType=jobcomp/filetxt 
-JobCompLoc=/var/log/slurm/jobcomp.log +SlurmctldDebug=info +SlurmctldLogFile=/var/log/slurmctld.log +SlurmdDebug=info +SlurmdLogFile=/var/log/slurmd.log +JobCompType=jobcomp/none +#JobCompLoc= # # ACCOUNTING -JobAcctGatherType=jobacct_gather/linux -JobAcctGatherFrequency=30 -# -AccountingStorageType=accounting_storage/slurmdbd -AccountingStorageHost=slurmdbd -AccountingStoragePort=6819 -AccountingStorageLoc=slurm_acct_db -#AccountingStoragePass= -#AccountingStorageUser= +#JobAcctGatherType=jobacct_gather/linux +#JobAcctGatherFrequency=30 # # COMPUTE NODES -NodeName=c[1-2] RealMemory=1000 State=UNKNOWN -# -# PARTITIONS -PartitionName=normal Default=yes Nodes=c[1-2] Priority=50 DefMemPerCPU=500 Shared=NO MaxNodes=1 MaxTime=5-00:00:00 DefaultTime=5-00:00:00 State=UP +NodeName=linux[1-32] Procs=1 State=UNKNOWN +PartitionName=debug Nodes=ALL Default=YES MaxTime=INFINITE State=UP diff --git a/docker/slurmdbd.conf b/docker/slurmdbd.conf index 62d6fe2..1e36286 100644 --- a/docker/slurmdbd.conf +++ b/docker/slurmdbd.conf @@ -16,22 +16,24 @@ AuthType=auth/munge #AuthInfo=/var/run/munge/munge.socket.2 # # slurmDBD info -DbdAddr=slurmdbd -DbdHost=slurmdbd -#DbdPort=6819 +DbdAddr=localhost +DbdHost=localhost +#DbdPort=7031 SlurmUser=slurm #MessageTimeout=300 -DebugLevel=4 +DebugLevel=verbose #DefaultQOS=normal,standby LogFile=/var/log/slurm/slurmdbd.log -PidFile=/var/run/slurmdbd/slurmdbd.pid +PidFile=/var/run/slurmdbd.pid #PluginDir=/usr/lib/slurm #PrivateData=accounts,users,usage,jobs #TrackWCKey=yes # # Database info StorageType=accounting_storage/mysql -StorageHost=mysql -StorageUser=slurm +#StorageHost=localhost +#StoragePort=1234 StoragePass=password -StorageLoc=slurm_acct_db +StorageUser=slurm +#StorageLoc=slurm_acct_db + From 2d4b5f0c1e8b947f19b471b3b5f2b72a09e623e0 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 21 Oct 2021 14:47:28 -0600 Subject: [PATCH 3/4] start of work to get ssh working... 
maybe Signed-off-by: vsoch --- docker/Dockerfile | 24 +++++++++++--- docker/README.md | 23 ++++++++++--- docker/docker-compose.yml | 11 +++++++ docker/docker-entrypoint.sh | 66 +++++++++++++++++++++++++++++++++++++ docker/sshd_config | 2 ++ 5 files changed, 116 insertions(+), 10 deletions(-) create mode 100755 docker/docker-entrypoint.sh create mode 100644 docker/sshd_config diff --git a/docker/Dockerfile b/docker/Dockerfile index 75ab489..a00f873 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -17,11 +17,25 @@ ENV LD_LIBRARY_PATH "$LD_LIBRARY_PATH:/home/$USER/.openmpi/lib/" ENV TMPDIR /tmp -RUN yum install -y automake && git clone https://github.com/hpc/spindle && \ +RUN yum install -y nc openssh-server openssh-clients iputils automake && git clone https://github.com/hpc/spindle && \ cd spindle && \ - ./configure --with-munge-dir=/etc/munge --enable-sec-munge --with-slurm-dir=/etc/slurm --with-testrm=slurm && \ +# --with-rsh-launch-option --with-testrm=slurm +# ./configure --with-munge-dir=/etc/munge --enable-sec-munge --with-slurm-dir=/etc/slurm --with-rsh-launch-option && \ + ./configure --with-testrm=slurm --with-rsh-launch --with-rsh-cmd=/usr/bin/ssh --with-tmpdir=/tmp && \ make && make install -RUN groupadd spindle && \ - useradd --create-home --gid spindle spindle && \ - echo -n "spindle" | passwd --stdin spindle +RUN mkdir -p /var/run/sshd && \ + chmod 0755 /var/run/sshd && \ + useradd -p spindle --create-home --shell /bin/bash spindle + +RUN ssh-keygen -b 2048 -t rsa -f /root/.ssh/id_rsa -q -N "" && \ + su spindle -c "ssh-keygen -b 2048 -t rsa -f /home/spindle/.ssh/id_rsa -q -N ''" && \ + cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys && \ + cat /home/spindle/.ssh/id_rsa.pub >> /home/spindle/.ssh/authorized_keys + +# Put spindle in debug mode; the level can be 1, 2, or 3 (or unset) +ENV SPINDLE_DEBUG 3 +EXPOSE 22 + +COPY docker-entrypoint.sh /docker-entrypoint.sh +ENTRYPOINT ["/docker-entrypoint.sh"] diff --git a/docker/README.md b/docker/README.md index 
962264f..49f23d2 100644 --- a/docker/README.md +++ b/docker/README.md @@ -38,7 +38,6 @@ Each of c1 and c2 are nodes for our cluster, and then slurmctld is like the logi ```bash $ docker exec -it slurmctld bash ``` - Try running a job! ```bash @@ -64,18 +63,24 @@ You can try running a job first without spindle: $ srun -N 1 cat /proc/self/maps ``` -If you try *with* spindle, this won't currently work: +If you try *with* spindle, you need to allocate a session first: ```bash -$ spindle srun -N 1 cat /proc/self/maps +$ salloc -N 1 +$ spindle --no-mpi cat /proc/self/maps ``` -So instead you can get an allocation first: +Notice that ssh works - which spindle should be installed to use. ```bash -$ salloc -N 1 +$ ssh c1 +exit + +$ ssh c2 +exit ``` +**NOTE** stopped here - this doesn't actually work. If you want to view the source code, go to /spindle. ```bash @@ -83,6 +88,14 @@ cd /spindle/testsuite ./runTests ``` +Optionally, you can better configure the cluster: + +```bash +sacctmgr -i create cluster spindle-cluster +sacctmgr add account spindle --immediate +sacctmgr create user spindle defaultaccount=spindle adminlevel=[None] --immediate +``` + ## 3. Use Spindle The first sanity check to see if spindle is working is to look at this output: diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 9a117c3..9a74b28 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -19,19 +19,23 @@ services: command: "slurmdbd" container_name: slurmdbd hostname: slurmdbd + privileged: true volumes: - etc_munge:/etc/munge - etc_slurm:/etc/slurm - var_log_slurm:/var/log/slurm - tmp_slurm:/tmp + - ./sshd_config:/etc/ssh/sshd_config expose: - "6819" + - "22" depends_on: - mysql slurmctld: build: context: . 
+ privileged: true command: "slurmctld" container_name: slurmctld hostname: slurmctld @@ -39,6 +43,7 @@ services: - slurmdbd expose: - "6817" + - "22" depends_on: - "slurmdbd" @@ -51,8 +56,11 @@ services: container_name: c1 volumes_from: - slurmctld + links: + - slurmctld expose: - "6818" + - "22" depends_on: - "slurmctld" @@ -63,10 +71,13 @@ services: privileged: true hostname: c2 container_name: c2 + links: + - slurmctld volumes_from: - slurmctld expose: - "6818" + - "22" depends_on: - "slurmctld" diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh new file mode 100755 index 0000000..006f042 --- /dev/null +++ b/docker/docker-entrypoint.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -e + +# Start ssh in all containers +systemctl enable sshd + +ssh-keygen -A +/usr/sbin/sshd -D & + +if [ "$1" = "slurmdbd" ] +then + echo "---> Starting the MUNGE Authentication service (munged) ..." + gosu munge /usr/sbin/munged + + echo "---> Starting the Slurm Database Daemon (slurmdbd) ..." + + { + . /etc/slurm/slurmdbd.conf + until echo "SELECT 1" | mysql -h $StorageHost -u$StorageUser -p$StoragePass 2>&1 > /dev/null + do + echo "-- Waiting for database to become active ..." + sleep 2 + done + } + echo "-- Database is now active ..." + + exec gosu slurm /usr/sbin/slurmdbd -Dvvv +fi + +if [ "$1" = "slurmctld" ] +then + echo "---> Starting the MUNGE Authentication service (munged) ..." + gosu munge /usr/sbin/munged + + echo "---> Waiting for slurmdbd to become active before starting slurmctld ..." + + until 2>/dev/null >/dev/tcp/slurmdbd/6819 + do + echo "-- slurmdbd is not available. Sleeping ..." + sleep 2 + done + echo "-- slurmdbd is now active ..." + + echo "---> Starting the Slurm Controller Daemon (slurmctld) ..." + exec gosu slurm /usr/sbin/slurmctld -Dvvv +fi + +if [ "$1" = "slurmd" ] +then + echo "---> Starting the MUNGE Authentication service (munged) ..." 
+ gosu munge /usr/sbin/munged + + echo "---> Waiting for slurmctld to become active before starting slurmd..." + + until 2>/dev/null >/dev/tcp/slurmctld/6817 + do + echo "-- slurmctld is not available. Sleeping ..." + sleep 2 + done + echo "-- slurmctld is now active ..." + + echo "---> Starting the Slurm Node Daemon (slurmd) ..." + exec /usr/sbin/slurmd -Dvvv +fi + +exec "$@" diff --git a/docker/sshd_config b/docker/sshd_config new file mode 100644 index 0000000..7ca03bf --- /dev/null +++ b/docker/sshd_config @@ -0,0 +1,2 @@ +ChallengeResponseAuthentication no +PasswordAuthentication no From 70a30d25676c2cf7bcd49d7cebf7cf4c254e64a2 Mon Sep 17 00:00:00 2001 From: vsoch Date: Thu, 21 Oct 2021 15:14:32 -0600 Subject: [PATCH 4/4] fixing issue with TMPDIR not being defined this now reproduces up to the same issue with overlay, which does not work on my computer. But it might work on someone elses! Signed-off-by: vsoch --- docker/README.md | 9 ++++++++- docker/docker-entrypoint.sh | 3 +++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/docker/README.md b/docker/README.md index 49f23d2..2f45ccf 100644 --- a/docker/README.md +++ b/docker/README.md @@ -70,7 +70,7 @@ $ salloc -N 1 $ spindle --no-mpi cat /proc/self/maps ``` -Notice that ssh works - which spindle should be installed to use. 
+Before we run tests, let's add the hosts c1 and c2 to ssh's known hosts (ssh will ask you to confirm with "yes" for each): ```bash $ ssh c1 @@ -88,6 +88,13 @@ cd /spindle/testsuite ./runTests ``` +If you need more debug output: + +```bash +# valid levels: 1, 2, 3 +export SPINDLE_DEBUG=2 +``` + Optionally, you can better configure the cluster: ```bash diff --git a/docker/docker-entrypoint.sh b/docker/docker-entrypoint.sh index 006f042..cccbfd5 100755 --- a/docker/docker-entrypoint.sh +++ b/docker/docker-entrypoint.sh @@ -4,6 +4,9 @@ set -e # Start ssh in all containers systemctl enable sshd +# Ensure the TMPDIR environment variable is defined for all users; spindle needs it +echo "export TMPDIR=/tmp" >> /etc/profile.d/tmpdir.sh + ssh-keygen -A /usr/sbin/sshd -D &