Merge pull request #6 from qbic-pipelines/fix/qpb-review

Fix/qpb-review
qbic-pipelines · Feb 23, 2024 · a5623d9 · a5623d9
2 parents d7074ec + ec3b86e
commit a5623d9
Show file tree

Hide file tree

Showing 12 changed files with 179 additions and 36 deletions.
diff --git a/.github/workflows/publish_docker.yml b/.github/workflows/publish_docker.yml
@@ -25,8 +25,8 @@ jobs:
             -   name: Publish to Registry
                 uses: elgohr/Publish-Docker-Github-Action@master
                 with:
-                    name: waseju/root_tissue_segmentation
-                    username: waseju
+                    name: qbic-pipelines/root_tissue_segmentation
+                    username: luiskuhn
                     password: '${{ secrets.MLF_CORE_SYNC_TOKEN}}'
                     registry: ghcr.io
                     tags: "latest,1.0.0"
diff --git a/.github/workflows/push_cont_ghcr_rts.yml b/.github/workflows/push_cont_ghcr_rts.yml
@@ -0,0 +1,38 @@
+name: Docker push of root_tissue_segmentation to GHCR
+# This builds the docker image and pushes it to GHCR
+# Runs on qbic-pipelines repo releases and push event to 'dev' branch (PR merges)
+on:
+  push:
+    branches:
+      - dev
+  release:
+    types: [published]
+
+jobs:
+  push_github:
+    name: Push new root_tissue_segmentation image to GHCR
+    runs-on: ubuntu-latest
+    # Only run for the qbic-pipelines repo, for releases and merged PRs
+    if: ${{ github.repository == 'qbic-pipelines/root-tissue-segmentation-core' }}
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@v2
+
+      - name: Build new docker image
+        run: docker build --no-cache . -t ghcr.io/qbic-pipelines/root_tissue_segmentation:latest
+
+      - name: Log in to registry
+        # Authenticate to GHCR with GITHUB_TOKEN as the account that triggered this run
+        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin
+
+      - name: Push image to GHCR (dev)
+        if: ${{ github.event_name == 'push' }}
+        run: |
+          docker tag ghcr.io/qbic-pipelines/root_tissue_segmentation:latest ghcr.io/qbic-pipelines/root_tissue_segmentation:dev
+          docker push ghcr.io/qbic-pipelines/root_tissue_segmentation:dev
+
+      - name: Push image to GHCR (release)
+        if: ${{ github.event_name == 'release' }}
+        run: |
+          docker tag ghcr.io/qbic-pipelines/root_tissue_segmentation:latest ghcr.io/qbic-pipelines/root_tissue_segmentation:${{ github.event.release.tag_name }}
+          docker push ghcr.io/qbic-pipelines/root_tissue_segmentation:${{ github.event.release.tag_name }}
diff --git a/.github/workflows/train_cpu.yml b/.github/workflows/train_cpu.yml
@@ -4,10 +4,10 @@ on: [ push, pull_request ]
 
 jobs:
     run:
-        runs-on: ubuntu-latest
+        runs-on: ubuntu-22.04
         strategy:
             matrix:
-                python: [ 3.7, 3.8 ]
+                python: [ 3.8 ]
 
         steps:
             -   name: Reclaim space
@@ -21,7 +21,7 @@ jobs:
                 uses: docker/build-push-action@v1
                 with:
                     dockerfile: Dockerfile
-                    repository: ghcr.io/waseju/root_tissue_segmentation
+                    repository: ghcr.io/qbic-pipelines/root_tissue_segmentation
                     tags: 1.0.0 # <<MLF-CORE_FORCE_BUMP>>
                     push: false
 
@@ -31,7 +31,7 @@ jobs:
                     python-version: ${{ matrix.python }}
 
             -   name: Install mlflow
-                run: pip install mlflow
+                run: pip install mlflow==2.10.2
 
             -   name: Train on the CPU
-                run: mlflow run . -P max_epochs=2
+                run: mlflow run . --build-image -P max_epochs=2 -P gpus=0
diff --git a/Dockerfile b/Dockerfile
@@ -1,7 +1,64 @@
-FROM mlfcore/base:1.2.0
+FROM nvidia/cuda:11.1.1-base-ubuntu20.04
+
+ENV NVIDIA_VISIBLE_DEVICES=all
+ENV NVIDIA_DRIVER_CAPABILITIES=all
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    curl \
+    wget \
+    ca-certificates \
+    sudo \
+    git \
+    bzip2 \
+    libx11-6 \
+    build-essential \
+    lshw \
+ && rm -rf /var/lib/apt/lists/*
+
+# Create a working directory and set it as default
+RUN mkdir /app
+RUN chmod 777 /app
+WORKDIR /app
+
+# Create a non-root user and switch to it
+RUN adduser --disabled-password --gecos '' --shell /bin/bash user 
+RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
+USER user
+
+# All users can use /home/user as their home directory
+ENV HOME=/home/user
+RUN chmod 777 /home/user
+
+ # Install Miniconda
+RUN wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh \
+ && chmod +x ~/miniconda.sh \
+ && ~/miniconda.sh -b -p ~/miniconda \
+ && rm ~/miniconda.sh
+ENV PATH=/home/user/miniconda/bin:$PATH
+ENV CONDA_AUTO_UPDATE_CONDA=false
+
+# Update Conda
+RUN conda update conda
+
+# To get real time output we need to disable the stdout buffer
+ENV PYTHONUNBUFFERED 1
+
+# Enable colors
+ENV TERM xterm-256color
+
+######################################
 
 # Install the conda environment
+
+RUN sudo apt-key del 7fa2af80
+RUN sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
+RUN sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub
+
 RUN sudo apt-get update
+RUN sudo apt-get install -y libgl1
+RUN sudo DEBIAN_FRONTEND="noninteractive"  apt-get install -y libglib2.0-0
+
 RUN sudo DEBIAN_FRONTEND="noninteractive"  apt-get -y install tzdata
 RUN sudo apt-get install -y --reinstall openmpi-bin libopenmpi-dev
 COPY environment.yml .
@@ -16,3 +73,4 @@ RUN conda env export --name root-tissue-segmentation > root-tissue-segmentation_
 
 # Currently required, since mlflow writes every file as root!
 USER root
+
diff --git a/MLproject b/MLproject
@@ -8,9 +8,9 @@ name: root-tissue-segmentation
 
 #conda_env: environment.yml
 docker_env:
-  image: ghcr.io/waseju/root_tissue_segmentation:latest
+  image: ghcr.io/qbic-pipelines/root_tissue_segmentation:1.0.0
   volumes: ["${PWD}/data:/data"]
-  run_params: [['--gpus', 'all'],['--ipc','host']]
+  run_params: [['--gpus', 'all'], ['--ipc','host']]
 environment: [["MLF_CORE_DOCKER_RUN", "TRUE"],["CUBLAS_WORKSPACE_CONFIG", ":4096:8"]]
 
 entry_points:

diff --git a/README.rst b/README.rst
@@ -27,6 +27,11 @@ Deterministic deep-learning approach to segment microscopy images of root tissue
 
 The project includes automated hyperparameter optimization, using the Optuna framework (https://optuna.org/). Optimal hyperparameters are used as default values in this training module.
 
+Usage
+----------------
+
+Stable technical documentation for usage can be found here: `Usage`_
+
 Activity Diagram
 ----------------
 
@@ -65,5 +70,6 @@ This package was created with `mlf-core`_ using cookiecutter_.
 .. _U-Net: https://lmb.informatik.uni-freiburg.de/Publications/2015/RFB15a/
 .. _U-Net++: https://arxiv.org/abs/1807.10165
 .. _U2-Net: https://arxiv.org/abs/2005.09007
+.. _Usage: https://github.com/qbic-pipelines/root-tissue-segmentation-core/blob/master/docs/usage.rst
 .. _mlf-core: https://mlf-core.readthedocs.io/en/latest/
 .. _cookiecutter: https://github.com/audreyr/cookiecutter
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -4,44 +4,58 @@ Usage
 Setup
 -------
 
-mlf-core based mlflow projects require either Conda or Docker to be installed.
-The usage of Docker is highly preferred, since it ensures that system-intelligence can fetch all required and accessible hardware.
-This cannot be guaranteed for MacOS let alone Windows environments.
+Projects based on mlf-core require either Conda or Docker to be installed; we recommend installing both. Using Docker to run the codebase is highly preferred, since it ensures that system-intelligence can fetch all required and accessible hardware. This cannot be guaranteed for macOS, let alone Windows environments.
 
 Conda
 +++++++
 
-There is no further setup required besides having Conda installed and CUDA configured for GPU support.
-mlflow will create a new environment for every run.
+Conda must be installed and CUDA configured for GPU support; mlflow will create a new environment for every run. Conda can be installed as instructed by the `Anaconda documentation <https://docs.anaconda.com/free/miniconda/>`_.
+
+CUDA Toolkit
+++++++++++++++
+
+CUDA can be installed as instructed by the `CUDA documentation <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#pre-installation-actions>`_. Please note the `Post install steps <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions>`_.
+
 
 Docker
 ++++++++
 
-If you use Docker you should not need to build the Docker container manually, since it should be available on Github Packages or another registry.
-However, if you want to build it manually for e.g. development purposes, ensure that the names matches the defined name in the ``MLproject``file.
-This is sufficient to train on the CPU. If you want to train using the GPU you need to have the `NVIDIA Container Toolkit <https://github.com/NVIDIA/nvidia-docker>`_ installed.
+If you use Docker you should not need to build the Docker container manually, since it should be available on GitHub Packages or another registry. However, if you want to build it manually, e.g. for development purposes, ensure that the image name matches the name defined in the ``MLproject`` file. Docker can be installed as instructed by the `Docker documentation <https://docs.docker.com/engine/install/>`_.
+
+This is sufficient to train on the CPU. If you want to train using the GPU you need to have the `NVIDIA Container Toolkit <https://github.com/NVIDIA/nvidia-container-toolkit>`_ installed.
+
+
+Test Environment
+++++++++++++++++++
+
+This codebase has been tested in a virtual machine running **Ubuntu 22.04 LTS**. We installed **Miniconda** using the method suggested for Linux in the `Anaconda documentation <https://docs.anaconda.com/free/miniconda/#quick-command-line-install>`_. We installed **CUDA Toolkit 12.3 Update 2** for Ubuntu 22.04 using the instructions for the `network installation <https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network>`_. We installed Docker **Version 25.0.3 (build 4debf41)**, following the documentation for `Ubuntu <https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository>`_, please note the `Post install steps <https://docs.docker.com/engine/install/linux-postinstall/>`_. We followed the documentation to install the **NVIDIA Container Toolkit version 1.14.5** using `Apt <https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt>`_.
+
+We installed `Mlflow <https://mlflow.org/>`_ version **2.10.2** using the following conda command: ``conda install conda-forge::mlflow=2.10.2``
 
 Training
 -----------
 
+Please see the `mlflow documentation <https://www.mlflow.org/docs/latest/cli.html#mlflow-run>`_. Set your desired environment in the ``MLproject`` file. A simple training test can be conducted by limiting the number of epochs via a parameter in the mlflow command, e.g. ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3``.
+
 Training on the CPU
 +++++++++++++++++++++++
 
-Set your desired environment in the MLproject file. Start training using ``mlflow run .``.
-No further parameters are required.
+Training on the CPU can be achieved by specifying zero GPUs in the command, i.e. ``mlflow run . --build-image -A runtime=nvidia -P gpus=0``. It is useful to restrict the number of epochs, e.g. ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3 -P gpus=0``.
 
 Training using GPUs
 +++++++++++++++++++++++
 
-Regularly used commands: ``mlflow run . -A gpus=all``, ``mlflow run . -A gpus=all -P max_epochs=2``.
+Regularly used commands for development and testing:
+
+- ``mlflow run . --build-image -A runtime=nvidia``
+- ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3``
+- ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3 -P gpus=1``
+- ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3 -P gpus=2``
 
-Please see the `mlflow documentation <https://www.mlflow.org/docs/latest/cli.html#mlflow-run>`_
+Conda environments will automatically use the GPU if available. Docker requires the accessible GPUs to be passed as runtime parameters. To train using all gpus run ``mlflow run . --build-image -A runtime=nvidia -P gpus=<<num_of_gpus>> -P acc=ddp``. 
 
-Conda environments will automatically use the GPU if available.
-Docker requires the accessible GPUs to be passed as runtime parameters. To train using all gpus run ``mlflow run . -A t-A gpus=all -P gpus=<<num_of_gpus>> -P acc=ddp``.
-To train only on CPU it is sufficient to call ``mlflow run . -A t``. To train on a single GPU, you can call ``mlflow run . -A t -A gpus=all -P gpus=1`` and for multiple GPUs (for example 2)
-``mlflow run . -A t -A gpus=all -P gpus=2 -P accelerator=ddp``.
-You can replace ``all`` with specific GPU ids (e.g. 0) if desired.
+To train on a single GPU, you can call ``mlflow run . --build-image -A runtime=nvidia -P gpus=1`` and for multiple GPUs (for example 2)
+``mlflow run . --build-image -A runtime=nvidia -P gpus=2 -P accelerator=ddp``.
 
 Hyperparameters and default values
 -----------

diff --git a/environment.yml b/environment.yml
@@ -5,18 +5,17 @@ channels:
   - pytorch
   - nvidia
 dependencies:
-  - nvidia::cudatoolkit=11.1.74
-  - defaults::python=3.9.2
+  - conda-forge::cudatoolkit=11.1.1
+  - defaults::python=3.8.3
   - defaults::boto3=1.17.20
   - conda-forge::tensorboardx=2.1
   - conda-forge::mlflow=1.14.1
   - conda-forge::rich=9.13.0
-  - pytorch::pytorch=1.8.1
-  - pytorch::torchvision=0.9.1
+  - pytorch::pytorch=1.8.1=py3.8_cuda11.1_cudnn8.0.5_0
+  - pytorch::torchvision=0.9.1=py38_cu111
   - pytorch-lightning=1.5.10
-  - pip=21.0.1
+  - pip=23.3.1
   - pip:
-    - pycuda==2020.1  # not on Conda
     - cloudpickle==1.6.0
     - system-intelligence==2.0.2
     - adabelief-pytorch==0.2.1
@@ -29,4 +28,6 @@ dependencies:
     - tifffile==2021.4.8
     - requests==2.25.1
     - torch-optimizer==0.3.0
-    - albumentations==1.1.0
+    - albumentations==1.3.1
+    - pandas==2.0.3
+
diff --git a/root_tissue_segmentation/data_loading/data_loader.py b/root_tissue_segmentation/data_loading/data_loader.py
@@ -1,6 +1,6 @@
 import pytorch_lightning as pl
 from albumentations import Compose
-from albumentations.augmentations import transforms
+from albumentations.augmentations.geometric import transforms
 from torch.utils.data import DataLoader
 
 from data_loading.phdfm import PHDFM

diff --git a/root_tissue_segmentation/data_loading/phdfm.py b/root_tissue_segmentation/data_loading/phdfm.py
@@ -213,7 +213,7 @@ def transform_files(self) -> (torch.FloatTensor, torch.IntTensor, pd.DataFrame):
                 weights)  # compute_class_weight('balanced', unique, masks.numpy().flatten())
             weights = pd.DataFrame({"class_ids": unique, "classes": classes, "weights": class_weights})
             weights['set_name'] = set_name
-            weight_df = weight_df.append(weights, ignore_index=True)
+            weight_df = pd.concat([weight_df, weights], ignore_index=True) # weight_df = weight_df.append(weights, ignore_index=True)
             tensor = torch.cat([imgs, masks], dim=3)
             tensors[set_name] = tensor
         print(wt_all)

diff --git a/root_tissue_segmentation/mlf_core/mlf_core.py b/root_tissue_segmentation/mlf_core/mlf_core.py
@@ -19,6 +19,21 @@ def __new__(cls):
             cls._instance = super(MLFCore, cls).__new__(cls)
         return cls._instance
 
+    @staticmethod
+    def set_deterministic_mode(general_seed, pytorch_seed, num_gpus):
+        os.environ['PYTHONHASHSEED'] = str(general_seed)  # Python general
+        np.random.seed(general_seed)  # Numpy random
+        random.seed(general_seed)  # Python random
+
+        torch.manual_seed(pytorch_seed)
+        torch.use_deterministic_algorithms(True)
+
+        if num_gpus > 0:
+            torch.cuda.manual_seed(pytorch_seed)
+            torch.cuda.manual_seed_all(pytorch_seed)  # For multiGPU
+            torch.backends.cudnn.deterministic = True
+            torch.backends.cudnn.benchmark = False # Disable
+
     @staticmethod
     def set_general_random_seeds(seed):
         os.environ['PYTHONHASHSEED'] = str(seed)  # Python general

diff --git a/root_tissue_segmentation/root_tissue_segmentation.py b/root_tissue_segmentation/root_tissue_segmentation.py
@@ -37,14 +37,18 @@
     mlflow.pytorch.autolog(log_models=False)
     # log conda env and system information
     MLFCore.log_sys_intel_conda_env()
+
     # parse cli arguments
     args = parser.parse_args()
     dict_args = vars(args)
+
     # store seed
     # number of gpus to make linter bit less restrict in terms of naming
     general_seed = dict_args['general_seed']
     pytorch_seed = dict_args['pytorch_seed']
     num_of_gpus = dict_args['gpus']
+
+    # setting deterministic mode, set random seeds
     MLFCore.set_general_random_seeds(general_seed)
     MLFCore.set_pytorch_random_seeds(pytorch_seed, num_of_gpus)
 
@@ -87,10 +91,17 @@
                                                 logger=TensorBoardLogger('/data'), auto_lr_find=False)
         tensorboard_output_path = f'data/default/version_{trainer.logger.version}'
 
+    # setting deterministic mode, cuda and pytorch settings
     trainer.deterministic = True
     trainer.benchmark = False
+
+    # setting deterministic mode, using additional mlf-core function to set mode 
+    MLFCore.set_deterministic_mode(general_seed, pytorch_seed, num_of_gpus)
+
+    # find lr
     # lrfind = trainer.tuner.lr_find(model,dm)
     # print(lrfind.suggestion())
+
     trainer.fit(model, dm)
     trainer.test(ckpt_path=checkpoint_callback.best_model_path,datamodule=dm)
     #trainer.save_checkpoint("/data/example.ckpt")