Skip to content

Commit

Permalink
Merge pull request #6 from qbic-pipelines/fix/qpb-review
Browse files Browse the repository at this point in the history
Fix/qpb-review
  • Loading branch information
luiskuhn authored Feb 23, 2024
2 parents d7074ec + ec3b86e commit a5623d9
Show file tree
Hide file tree
Showing 12 changed files with 179 additions and 36 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/publish_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ jobs:
- name: Publish to Registry
uses: elgohr/Publish-Docker-Github-Action@master
with:
name: waseju/root_tissue_segmentation
username: waseju
name: qbic-pipelines/root_tissue_segmentation
username: luiskuhn
password: '${{ secrets.MLF_CORE_SYNC_TOKEN}}'
registry: ghcr.io
tags: "latest,1.0.0"
38 changes: 38 additions & 0 deletions .github/workflows/push_cont_ghcr_rts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: Docker push of root_tissue_segmentation to GHCR
# Builds the Docker image from the repository root and pushes it to GHCR.
# Runs on pushes to the 'dev' branch (PR merges) and on published releases.
on:
  push:
    branches:
      - dev
  release:
    types: [published]

# The workflow token must be allowed to write packages in order to push to ghcr.io;
# read access to the repository contents is enough for the checkout step.
permissions:
  contents: read
  packages: write

jobs:
  push_github:
    name: Push new root_tissue_segmentation image to GHCR
    runs-on: ubuntu-latest
    # Only run for the qbic-pipelines repo, for releases and merged PRs
    if: ${{ github.repository == 'qbic-pipelines/root-tissue-segmentation-core' }}
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v4

      # The image is always built as ':latest' locally and re-tagged before pushing.
      - name: Build new docker image
        run: docker build --no-cache . -t ghcr.io/qbic-pipelines/root_tissue_segmentation:latest

      - name: Log in to registry
        # Authenticate with the workflow's GITHUB_TOKEN as the user who triggered the run
        run: echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u ${{ github.actor }} --password-stdin

      # Pushes to 'dev' publish the ':dev' tag.
      - name: Push image to GHCR (dev)
        if: ${{ github.event_name == 'push' }}
        run: |
          docker tag ghcr.io/qbic-pipelines/root_tissue_segmentation:latest ghcr.io/qbic-pipelines/root_tissue_segmentation:dev
          docker push ghcr.io/qbic-pipelines/root_tissue_segmentation:dev

      # Published releases publish a tag named after the release tag.
      - name: Push image to GHCR (release)
        if: ${{ github.event_name == 'release' }}
        run: |
          docker tag ghcr.io/qbic-pipelines/root_tissue_segmentation:latest ghcr.io/qbic-pipelines/root_tissue_segmentation:${{ github.event.release.tag_name }}
          docker push ghcr.io/qbic-pipelines/root_tissue_segmentation:${{ github.event.release.tag_name }}
10 changes: 5 additions & 5 deletions .github/workflows/train_cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ on: [ push, pull_request ]

jobs:
run:
runs-on: ubuntu-latest
runs-on: ubuntu-22.04
strategy:
matrix:
python: [ 3.7, 3.8 ]
python: [ 3.8 ]

steps:
- name: Reclaim space
Expand All @@ -21,7 +21,7 @@ jobs:
uses: docker/build-push-action@v1
with:
dockerfile: Dockerfile
repository: ghcr.io/waseju/root_tissue_segmentation
repository: ghcr.io/qbic-pipelines/root_tissue_segmentation
tags: 1.0.0 # <<MLF-CORE_FORCE_BUMP>>
push: false

Expand All @@ -31,7 +31,7 @@ jobs:
python-version: ${{ matrix.python }}

- name: Install mlflow
run: pip install mlflow
run: pip install mlflow==2.10.2

- name: Train on the CPU
run: mlflow run . -P max_epochs=2
run: mlflow run . --build-image -P max_epochs=2 -P gpus=0
60 changes: 59 additions & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,7 +1,64 @@
FROM mlfcore/base:1.2.0
FROM nvidia/cuda:11.1.1-base-ubuntu20.04

ENV NVIDIA_VISIBLE_DEVICES=all
ENV NVIDIA_DRIVER_CAPABILITIES=all

# Install some basic utilities
RUN apt-get update && apt-get install -y \
curl \
wget \
ca-certificates \
sudo \
git \
bzip2 \
libx11-6 \
build-essential \
lshw \
&& rm -rf /var/lib/apt/lists/*

# Create a working directory and set it as default
RUN mkdir /app
RUN chmod 777 /app
WORKDIR /app

# Create a non-root user and switch to it
RUN adduser --disabled-password --gecos '' --shell /bin/bash user
RUN echo "user ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-user
USER user

# All users can use /home/user as their home directory
ENV HOME=/home/user
RUN chmod 777 /home/user

# Install Miniconda
RUN wget -O ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-py38_4.9.2-Linux-x86_64.sh \
&& chmod +x ~/miniconda.sh \
&& ~/miniconda.sh -b -p ~/miniconda \
&& rm ~/miniconda.sh
ENV PATH=/home/user/miniconda/bin:$PATH
ENV CONDA_AUTO_UPDATE_CONDA=false

# Update Conda
RUN conda update conda

# To get real time output we need to disable the stdout buffer
ENV PYTHONUNBUFFERED 1

# Enable colors
ENV TERM xterm-256color

######################################

# Install the conda environment

RUN sudo apt-key del 7fa2af80
RUN sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/3bf863cc.pub
RUN sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64/7fa2af80.pub

RUN sudo apt-get update
RUN sudo apt-get install -y libgl1
RUN sudo DEBIAN_FRONTEND="noninteractive" apt-get install -y libglib2.0-0

RUN sudo DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
RUN sudo apt-get install -y --reinstall openmpi-bin libopenmpi-dev
COPY environment.yml .
Expand All @@ -16,3 +73,4 @@ RUN conda env export --name root-tissue-segmentation > root-tissue-segmentation_

# Currently required, since mlflow writes every file as root!
USER root

4 changes: 2 additions & 2 deletions MLproject
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ name: root-tissue-segmentation

#conda_env: environment.yml
docker_env:
image: ghcr.io/waseju/root_tissue_segmentation:latest
image: ghcr.io/qbic-pipelines/root_tissue_segmentation:1.0.0
volumes: ["${PWD}/data:/data"]
run_params: [['--gpus', 'all'],['--ipc','host']]
run_params: [['--gpus', 'all'], ['--ipc','host']]
environment: [["MLF_CORE_DOCKER_RUN", "TRUE"],["CUBLAS_WORKSPACE_CONFIG", ":4096:8"]]

entry_points:
Expand Down
6 changes: 6 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@ Deterministic deep-learning approach to segment microscopy images of root tissue

The project includes automated hyperparameter optimization, using the Optuna framework (https://optuna.org/). Optimal hyperparameters are used as default values in this training module.

Usage
----------------

Stable technical documentation on usage can be found here: `Usage`_

Activity Diagram
----------------

Expand Down Expand Up @@ -65,5 +70,6 @@ This package was created with `mlf-core`_ using cookiecutter_.
.. _U-Net: https://lmb.informatik.uni-freiburg.de/Publications/2015/RFB15a/
.. _U-Net++: https://arxiv.org/abs/1807.10165
.. _U2-Net: https://arxiv.org/abs/2005.09007
.. _Usage: https://github.com/qbic-pipelines/root-tissue-segmentation-core/blob/master/docs/usage.rst
.. _mlf-core: https://mlf-core.readthedocs.io/en/latest/
.. _cookiecutter: https://github.com/audreyr/cookiecutter
48 changes: 31 additions & 17 deletions docs/usage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,44 +4,58 @@ Usage
Setup
-------

mlf-core based mlflow projects require either Conda or Docker to be installed.
The usage of Docker is highly preferred, since it ensures that system-intelligence can fetch all required and accessible hardware.
This cannot be guaranteed for MacOS let alone Windows environments.
Projects based on mlf-core require either Conda or Docker to be installed; we recommend installing both. Using Docker is highly preferred for running the codebase, since it ensures that system-intelligence can fetch all required and accessible hardware. This cannot be guaranteed for macOS, let alone Windows environments.

Conda
+++++++

There is no further setup required besides having Conda installed and CUDA configured for GPU support.
mlflow will create a new environment for every run.
Conda must be installed and CUDA configured for GPU support; mlflow will create a new environment for every run. Conda can be installed as instructed by the `Anaconda documentation <https://docs.anaconda.com/free/miniconda/>`_.

CUDA Toolkit
++++++++++++++

CUDA can be installed as instructed by the `CUDA documentation <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#pre-installation-actions>`_. Please note the `Post install steps <https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#post-installation-actions>`_.


Docker
++++++++

If you use Docker you should not need to build the Docker container manually, since it should be available on Github Packages or another registry.
However, if you want to build it manually for e.g. development purposes, ensure that the names matches the defined name in the ``MLproject``file.
This is sufficient to train on the CPU. If you want to train using the GPU you need to have the `NVIDIA Container Toolkit <https://github.com/NVIDIA/nvidia-docker>`_ installed.
If you use Docker you should not need to build the Docker container manually, since it should be available on GitHub Packages or another registry. However, if you want to build it manually, e.g. for development purposes, ensure that its name matches the name defined in the ``MLproject`` file. Docker can be installed as instructed by the `Docker documentation <https://docs.docker.com/engine/install/>`_.

This is sufficient to train on the CPU. If you want to train using the GPU you need to have the `NVIDIA Container Toolkit <https://github.com/NVIDIA/nvidia-container-toolkit>`_ installed.


Test Environment
++++++++++++++++++

This codebase has been tested in a virtual machine running **Ubuntu 22.04 LTS**. We installed **Miniconda** using the method suggested for Linux in the `Anaconda documentation <https://docs.anaconda.com/free/miniconda/#quick-command-line-install>`_. We installed **CUDA Toolkit 12.3 Update 2** for Ubuntu 22.04 using the instructions for the `network installation <https://developer.nvidia.com/cuda-downloads?target_os=Linux&target_arch=x86_64&Distribution=Ubuntu&target_version=22.04&target_type=deb_network>`_. We installed Docker **Version 25.0.3 (build 4debf41)**, following the documentation for `Ubuntu <https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository>`_, please note the `Post install steps <https://docs.docker.com/engine/install/linux-postinstall/>`_. We followed the documentation to install the **NVIDIA Container Toolkit version 1.14.5** using `Apt <https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-apt>`_.

We installed `Mlflow <https://mlflow.org/>`_ version **2.10.2** using the following conda command: ``conda install conda-forge::mlflow=2.10.2``

Training
-----------

Please see the `mlflow documentation <https://www.mlflow.org/docs/latest/cli.html#mlflow-run>`_. Set your desired environment in the ``MLproject`` file. A simple training test can be conducted by specifying a limited number of epochs via a parameter in the mlflow command, e.g. ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3``.

Training on the CPU
+++++++++++++++++++++++

Set your desired environment in the MLproject file. Start training using ``mlflow run .``.
No further parameters are required.
Training on the CPU can be achieved by specifying zero GPUs in the command, i.e. ``mlflow run . --build-image -A runtime=nvidia -P gpus=0``. It is useful to restrict the number of epochs, e.g. ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3 -P gpus=0``.

Training using GPUs
+++++++++++++++++++++++

Regularly used commands: ``mlflow run . -A gpus=all``, ``mlflow run . -A gpus=all -P max_epochs=2``.
Regularly used commands for development and testing:

- ``mlflow run . --build-image -A runtime=nvidia``
- ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3``
- ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3 -P gpus=1``
- ``mlflow run . --build-image -A runtime=nvidia -P max_epochs=3 -P gpus=2``

Please see the `mlflow documentation <https://www.mlflow.org/docs/latest/cli.html#mlflow-run>`_
Conda environments will automatically use the GPU if available. Docker requires the accessible GPUs to be passed as runtime parameters. To train using all GPUs, run ``mlflow run . --build-image -A runtime=nvidia -P gpus=<<num_of_gpus>> -P acc=ddp``.

Conda environments will automatically use the GPU if available.
Docker requires the accessible GPUs to be passed as runtime parameters. To train using all gpus run ``mlflow run . -A t-A gpus=all -P gpus=<<num_of_gpus>> -P acc=ddp``.
To train only on CPU it is sufficient to call ``mlflow run . -A t``. To train on a single GPU, you can call ``mlflow run . -A t -A gpus=all -P gpus=1`` and for multiple GPUs (for example 2)
``mlflow run . -A t -A gpus=all -P gpus=2 -P accelerator=ddp``.
You can replace ``all`` with specific GPU ids (e.g. 0) if desired.
To train on a single GPU, you can call ``mlflow run . --build-image -A runtime=nvidia -P gpus=1`` and for multiple GPUs (for example 2)
``mlflow run . --build-image -A runtime=nvidia -P gpus=2 -P accelerator=ddp``.

Hyperparameters and default values
-----------
Expand Down
15 changes: 8 additions & 7 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,17 @@ channels:
- pytorch
- nvidia
dependencies:
- nvidia::cudatoolkit=11.1.74
- defaults::python=3.9.2
- conda-forge::cudatoolkit=11.1.1
- defaults::python=3.8.3
- defaults::boto3=1.17.20
- conda-forge::tensorboardx=2.1
- conda-forge::mlflow=1.14.1
- conda-forge::rich=9.13.0
- pytorch::pytorch=1.8.1
- pytorch::torchvision=0.9.1
- pytorch::pytorch=1.8.1=py3.8_cuda11.1_cudnn8.0.5_0
- pytorch::torchvision=0.9.1=py38_cu111
- pytorch-lightning=1.5.10
- pip=21.0.1
- pip=23.3.1
- pip:
- pycuda==2020.1 # not on Conda
- cloudpickle==1.6.0
- system-intelligence==2.0.2
- adabelief-pytorch==0.2.1
Expand All @@ -29,4 +28,6 @@ dependencies:
- tifffile==2021.4.8
- requests==2.25.1
- torch-optimizer==0.3.0
- albumentations==1.1.0
- albumentations==1.3.1
- pandas==2.0.3

2 changes: 1 addition & 1 deletion root_tissue_segmentation/data_loading/data_loader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytorch_lightning as pl
from albumentations import Compose
from albumentations.augmentations import transforms
from albumentations.augmentations.geometric import transforms
from torch.utils.data import DataLoader

from data_loading.phdfm import PHDFM
Expand Down
2 changes: 1 addition & 1 deletion root_tissue_segmentation/data_loading/phdfm.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ def transform_files(self) -> (torch.FloatTensor, torch.IntTensor, pd.DataFrame):
weights) # compute_class_weight('balanced', unique, masks.numpy().flatten())
weights = pd.DataFrame({"class_ids": unique, "classes": classes, "weights": class_weights})
weights['set_name'] = set_name
weight_df = weight_df.append(weights, ignore_index=True)
weight_df = pd.concat([weight_df, weights], ignore_index=True) # weight_df = weight_df.append(weights, ignore_index=True)
tensor = torch.cat([imgs, masks], dim=3)
tensors[set_name] = tensor
print(wt_all)
Expand Down
15 changes: 15 additions & 0 deletions root_tissue_segmentation/mlf_core/mlf_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@ def __new__(cls):
cls._instance = super(MLFCore, cls).__new__(cls)
return cls._instance

# Seeds the Python, NumPy and PyTorch random number generators and asks PyTorch
# to use deterministic algorithms.
#   general_seed: used for PYTHONHASHSEED, NumPy and Python's `random` module.
#   pytorch_seed: used for the PyTorch generators (CPU, and CUDA when num_gpus > 0).
#   num_gpus:     number of GPUs in use; values above 0 enable the CUDA seeding.
# NOTE(review): indentation is not visible in this view, so it is unclear whether the
# cuDNN flags at the end belong to the `num_gpus > 0` branch — confirm against the file.
@staticmethod
def set_deterministic_mode(general_seed, pytorch_seed, num_gpus):
os.environ['PYTHONHASHSEED'] = str(general_seed) # Python general
np.random.seed(general_seed) # Numpy random
random.seed(general_seed) # Python random

torch.manual_seed(pytorch_seed)
torch.use_deterministic_algorithms(True)

if num_gpus > 0:
torch.cuda.manual_seed(pytorch_seed)
torch.cuda.manual_seed_all(pytorch_seed) # For multiGPU
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False # Disable cuDNN benchmark mode

@staticmethod
def set_general_random_seeds(seed):
os.environ['PYTHONHASHSEED'] = str(seed) # Python general
Expand Down
11 changes: 11 additions & 0 deletions root_tissue_segmentation/root_tissue_segmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,18 @@
mlflow.pytorch.autolog(log_models=False)
# log conda env and system information
MLFCore.log_sys_intel_conda_env()

# parse cli arguments
args = parser.parse_args()
dict_args = vars(args)

# store seed
# number of gpus, to make the linter a bit less restrictive in terms of naming
general_seed = dict_args['general_seed']
pytorch_seed = dict_args['pytorch_seed']
num_of_gpus = dict_args['gpus']

# setting deterministic mode, set random seeds
MLFCore.set_general_random_seeds(general_seed)
MLFCore.set_pytorch_random_seeds(pytorch_seed, num_of_gpus)

Expand Down Expand Up @@ -87,10 +91,17 @@
logger=TensorBoardLogger('/data'), auto_lr_find=False)
tensorboard_output_path = f'data/default/version_{trainer.logger.version}'

# setting deterministic mode, cuda and pytorch settings
trainer.deterministic = True
trainer.benchmark = False

# setting deterministic mode, using additional mlf-core function to set mode
MLFCore.set_deterministic_mode(general_seed, pytorch_seed, num_of_gpus)

# find lr
# lrfind = trainer.tuner.lr_find(model,dm)
# print(lrfind.suggestion())

trainer.fit(model, dm)
trainer.test(ckpt_path=checkpoint_callback.best_model_path,datamodule=dm)
#trainer.save_checkpoint("/data/example.ckpt")
Expand Down

0 comments on commit a5623d9

Please sign in to comment.