
Commit bfe0112
Merge pull request #362 from bghira/main
deepfloyd, docker, multi-res validations, aspect ratio rounding knob
bghira authored Apr 22, 2024
2 parents ccd21d8 + b7bcfb9 commit bfe0112
Showing 33 changed files with 1,323 additions and 238 deletions.
82 changes: 82 additions & 0 deletions Dockerfile
@@ -0,0 +1,82 @@
# SimpleTuner needs CUDA 11.8 (CU118)
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

# /workspace is the default volume for Runpod & other hosts
WORKDIR /workspace

# Refresh apt package lists
RUN apt-get update -y

# Prevent commands from hanging while waiting
# on user input during the build
ENV DEBIAN_FRONTEND=noninteractive

# Install openssh & git
RUN apt-get install -y --no-install-recommends openssh-server \
openssh-client \
git \
git-lfs

# Install miscellaneous Unix utilities
RUN apt-get install -y wget \
curl \
tmux \
tldr \
nvtop \
vim \
rsync \
net-tools \
less \
iputils-ping \
7zip \
zip \
unzip \
htop \
inotify-tools

# Set up git to support LFS and to store credentials; useful for the Hugging Face Hub
RUN git config --global credential.helper store && \
git lfs install

# Install Python VENV
RUN apt-get install -y python3.10-venv

# Ensure SSH access. Not needed for Runpod, but required on Vast.ai and other Docker hosts
EXPOSE 22/tcp

# Install misc Python & CUDA Libraries
RUN apt-get update -y && apt-get install -y python3 python3-pip libcudnn8 libcudnn8-dev
RUN python3 -m pip install pip --upgrade

# HF
ARG HUGGING_FACE_HUB_TOKEN
ENV HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN
ENV HF_HOME=/workspace/huggingface

RUN pip3 install "huggingface_hub[cli]"

RUN huggingface-cli login --token "$HUGGING_FACE_HUB_TOKEN" --add-to-git-credential

# WanDB
ARG WANDB_TOKEN
ENV WANDB_TOKEN=$WANDB_TOKEN

RUN pip3 install wandb

RUN wandb login "$WANDB_TOKEN"

# Clone SimpleTuner
RUN git clone https://github.com/bghira/SimpleTuner --branch release
# RUN git clone https://github.com/bghira/SimpleTuner --branch main # Uncomment to use latest (possibly unstable) version

# Install SimpleTuner
RUN pip3 install poetry
RUN cd SimpleTuner && python3 -m venv .venv && poetry install --no-root
RUN chmod +x SimpleTuner/train_sdxl.sh
RUN chmod +x SimpleTuner/train_sd2x.sh

# Copy start script with exec permissions
COPY --chmod=755 docker-start.sh /start.sh

# Dummy entrypoint
ENTRYPOINT [ "/start.sh" ]
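For reference, a minimal sketch of building and running this image; the `simpletuner` tag, host port, mount path, and token values are placeholders, not part of the repository:

```bash
# Build the image, passing the tokens consumed by the ARG directives above
# (both token values here are placeholders).
docker build \
  --build-arg HUGGING_FACE_HUB_TOKEN=hf_xxxxxxxx \
  --build-arg WANDB_TOKEN=xxxxxxxx \
  -t simpletuner .

# Run with GPU access; /start.sh starts sshd and then sleeps, so training
# is launched from an SSH session or via `docker exec`.
docker run --gpus all -d -p 2222:22 \
  -v /data/workspace:/workspace \
  simpletuner
```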
2 changes: 2 additions & 0 deletions INSTALL.md
@@ -1,5 +1,7 @@
## Setup

For users who wish to use Docker or another container orchestration platform, see [this document](/documentation/DOCKER.md) first.

1. Clone the repository and install the dependencies:

```bash
57 changes: 52 additions & 5 deletions OPTIONS.md
@@ -166,9 +166,13 @@ This guide provides a user-friendly breakdown of the command-line options availa
This is a basic overview meant to help you get started. For a complete list of options and more detailed explanations, please refer to the full specification:

```
usage: train_sdxl.py [-h] [--snr_gamma SNR_GAMMA] [--model_type {full,lora}]
[--lora_type {Standard}] [--lora_rank LORA_RANK]
[--lora_alpha LORA_ALPHA] [--lora_dropout LORA_DROPOUT]
usage: train_sdxl.py [-h] [--snr_gamma SNR_GAMMA] [--use_soft_min_snr]
[--soft_min_snr_sigma_data SOFT_MIN_SNR_SIGMA_DATA]
[--model_type {full,lora,deepfloyd-full,deepfloyd-lora,deepfloyd-stage2,deepfloyd-stage2-lora}]
[--lora_type {Standard}]
[--lora_init_type {default,gaussian,loftq}]
[--lora_rank LORA_RANK] [--lora_alpha LORA_ALPHA]
[--lora_dropout LORA_DROPOUT]
--pretrained_model_name_or_path
PRETRAINED_MODEL_NAME_OR_PATH
[--pretrained_vae_model_name_or_path PRETRAINED_VAE_MODEL_NAME_OR_PATH]
@@ -204,10 +208,12 @@ usage: train_sdxl.py [-h] [--snr_gamma SNR_GAMMA] [--model_type {full,lora}]
[--seed_for_each_device SEED_FOR_EACH_DEVICE]
[--resolution RESOLUTION]
[--resolution_type {pixel,area}]
[--aspect_bucket_rounding {1,2,3,4,5,6,7,8,9}]
[--minimum_image_size MINIMUM_IMAGE_SIZE]
[--maximum_image_size MAXIMUM_IMAGE_SIZE]
[--target_downsample_size TARGET_DOWNSAMPLE_SIZE]
[--train_text_encoder]
[--tokenizer_max_length TOKENIZER_MAX_LENGTH]
[--train_batch_size TRAIN_BATCH_SIZE]
[--num_train_epochs NUM_TRAIN_EPOCHS]
[--max_train_steps MAX_TRAIN_STEPS]
@@ -252,6 +258,8 @@ usage: train_sdxl.py [-h] [--snr_gamma SNR_GAMMA] [--model_type {full,lora}]
[--validation_negative_prompt VALIDATION_NEGATIVE_PROMPT]
[--num_validation_images NUM_VALIDATION_IMAGES]
[--validation_steps VALIDATION_STEPS]
[--num_eval_images NUM_EVAL_IMAGES]
[--eval_dataset_id EVAL_DATASET_ID]
[--validation_num_inference_steps VALIDATION_NUM_INFERENCE_STEPS]
[--validation_resolution VALIDATION_RESOLUTION]
[--validation_noise_scheduler {ddim,ddpm,euler,euler-a,unipc}]
@@ -291,7 +299,14 @@ options:
SNR weighting gamma to be used if rebalancing the
loss. Recommended value is 5.0. More details here:
https://arxiv.org/abs/2303.09556.
--model_type {full,lora}
--use_soft_min_snr If set, the soft min SNR calculation method will be
used. This method requires the sigma_data parameter,
and raises an error if it is not provided.
--soft_min_snr_sigma_data SOFT_MIN_SNR_SIGMA_DATA
The standard deviation of the data used in the soft
min SNR weighting method. This is required when using
the soft min SNR calculation method.
--model_type {full,lora,deepfloyd-full,deepfloyd-lora,deepfloyd-stage2,deepfloyd-stage2-lora}
The training type to use. 'full' will train the full
model, while 'lora' will train a LoRA adapter, a much
smaller set of weights that enables faster training.
@@ -300,6 +315,16 @@ options:
a different type of LoRA to train here. Currently,
only 'Standard' type is supported. This option exists
for compatibility with Kohya configuration files.
--lora_init_type {default,gaussian,loftq}
The initialization type for the LoRA model. 'default'
will use Microsoft's initialization method, 'gaussian'
will use a Gaussian scaled distribution, and 'loftq'
will use LoftQ initialization. In short experiments,
'default' produced accurate results earlier in
training, 'gaussian' produced slightly more creative
outputs, and 'loftq' produced markedly different
results with worse quality at first, potentially
taking longer to converge than the other methods.
--lora_rank LORA_RANK
The dimension of the LoRA update matrices.
--lora_alpha LORA_ALPHA
@@ -518,6 +543,13 @@ options:
resized to the resolution by pixel edge. If 'area',
the images will be resized so the pixel area is this
many megapixels.
--aspect_bucket_rounding {1,2,3,4,5,6,7,8,9}
The number of decimal places to round aspect ratios
to when creating aspect buckets. Higher precision
requires image sizes that remain compatible with it,
and results in a greater number of buckets, which may
not be desirable.
--minimum_image_size MINIMUM_IMAGE_SIZE
The minimum resolution for both sides of input images.
If --delete_unwanted_images is set, images smaller
@@ -545,6 +577,9 @@ options:
cropping to 1 megapixel.
--train_text_encoder (SD 2.x only) Whether to train the text encoder. If
set, the text encoder should be float32 precision.
--tokenizer_max_length TOKENIZER_MAX_LENGTH
The maximum sequence length for the tokenizer. If not
set, this defaults to the tokenizer's own maximum
length.
--train_batch_size TRAIN_BATCH_SIZE
Batch size (per device) for the training dataloader.
--num_train_epochs NUM_TRAIN_EPOCHS
@@ -658,7 +693,10 @@ options:
--adam_bfloat16 Whether or not to use stochastic bf16 in Adam.
Currently the only supported optimizer.
--max_grad_norm MAX_GRAD_NORM
Max gradient norm.
Clipping the gradient norm can help prevent exploding
gradients, but it may also harm training by
introducing artifacts or by making existing artifacts
harder to train away.
--push_to_hub Whether or not to push the model to the Hub.
--hub_token HUB_TOKEN
The token to use to push to the Model Hub. Do not use
@@ -719,6 +757,15 @@ options:
running the prompt `args.validation_prompt` multiple
times: `args.num_validation_images` and logging the
images.
--num_eval_images NUM_EVAL_IMAGES
If possible, this many eval images will be selected
from each dataset. This is used when training super-
resolution models such as DeepFloyd Stage II, which
will upscale input images from the training set.
--eval_dataset_id EVAL_DATASET_ID
When provided, only this dataset's images will be used
as the eval set, keeping the training and eval images
separate.
--validation_num_inference_steps VALIDATION_NUM_INFERENCE_STEPS
The default scheduler, DDIM, benefits from more steps.
UniPC can do well with just 10-15. For more speed
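Putting a few of the newly added options together, a sketch of an invocation follows; the model path, dataset id, and values shown are illustrative placeholders, not recommendations from this commit:

```bash
# Illustrative only: combines several flags added in this commit.
python3 train_sdxl.py \
  --pretrained_model_name_or_path stabilityai/stable-diffusion-xl-base-1.0 \
  --model_type lora \
  --lora_init_type gaussian \
  --aspect_bucket_rounding 2 \
  --num_eval_images 4 \
  --eval_dataset_id my-eval-set \
  --validation_steps 100
```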
1 change: 1 addition & 0 deletions README.md
@@ -47,6 +47,7 @@ For memory-constrained systems, see the [DeepSpeed document](/documentation/DEEP
- Optional EMA (Exponential moving average) weight network to counteract model overfitting and improve training stability. **Note:** This does not apply to LoRA.
- Support for a variety of image sizes and aspect ratios, enabling widescreen and portrait training on SDXL and SD 2.x.
- Train directly from an S3-compatible storage provider, eliminating the requirement for expensive local storage. (Tested with Cloudflare R2 and Wasabi S3)
- DeepFloyd Stage I and II: full U-Net fine-tuning, or parameter-efficient fine-tuning via LoRA, using 22G of VRAM

### Stable Diffusion 2.0/2.1

2 changes: 2 additions & 0 deletions TUTORIAL.md
@@ -221,6 +221,8 @@ Here's a breakdown of what each environment variable does:
- Optionally, a user prompt library or the built-in prompt library may be used to generate more than 84 images on each checkpoint across a large number of concepts.
- See `--user_prompt_library` for more information.

For DeepFloyd, a dedicated page documents the specific options to set. Visit [this document](/documentation/DEEPFLOYD.md) for a head start.

#### Data Locations

- `BASE_DIR`, `INSTANCE_DIR`, `OUTPUT_DIR`: Directories for the training data, instance data, and output models.
30 changes: 30 additions & 0 deletions docker-start.sh
@@ -0,0 +1,30 @@
#!/bin/bash

# Export useful ENV variables, including all Runpod-specific vars, to /etc/rp_environment
# This file can then later be sourced in a login shell
echo "Exporting environment variables..."
printenv |
    grep -E '^RUNPOD_|^PATH=|^HF_HOME=|^HUGGING_FACE_HUB_TOKEN=|^_=' |
    sed 's/^\([^=]*\)=\(.*\)$/export \1="\2"/' >>/etc/rp_environment # split on the first '=' only, so values containing '=' survive

# Add it to Bash login script
echo 'source /etc/rp_environment' >>~/.bashrc

# Vast.ai uses $SSH_PUBLIC_KEY
if [[ $SSH_PUBLIC_KEY ]]; then
PUBLIC_KEY="${SSH_PUBLIC_KEY}"
fi

# Runpod uses $PUBLIC_KEY
if [[ $PUBLIC_KEY ]]; then
mkdir -p ~/.ssh
chmod 700 ~/.ssh
echo "${PUBLIC_KEY}" >>~/.ssh/authorized_keys
chmod -R 700 ~/.ssh
fi

# Start SSH server
service ssh start

# 🫡
sleep infinity
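Once the container is up, login shells pick up the exported variables automatically via ~/.bashrc; in other contexts they can be restored by hand (a small sketch, assuming the file was written as above):

```bash
# Restore the container's exported environment in a fresh shell.
source /etc/rp_environment
echo "$HF_HOME"   # /workspace/huggingface, as set in the Dockerfile
```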