From 7df9a618c06e423ff73d371354efd61af380b925 Mon Sep 17 00:00:00 2001 From: bghira Date: Thu, 12 Oct 2023 20:11:35 -0700 Subject: [PATCH 01/24] update OPTIONS --- OPTIONS.md | 152 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 99 insertions(+), 53 deletions(-) diff --git a/OPTIONS.md b/OPTIONS.md index 5bdb0c9b..2341c710 100644 --- a/OPTIONS.md +++ b/OPTIONS.md @@ -130,24 +130,33 @@ This guide provides a user-friendly breakdown of the command-line options availa This is a basic overview meant to help you get started. For a complete list of options and more detailed explanations, please refer to the full specification: ``` -usage: train_sdxl.py [-h] [--snr_gamma SNR_GAMMA] --pretrained_model_name_or_path PRETRAINED_MODEL_NAME_OR_PATH [--pretrained_vae_model_name_or_path PRETRAINED_VAE_MODEL_NAME_OR_PATH] [--prediction_type {epsilon,v_prediction,sample}] [--snr_weight SNR_WEIGHT] - [--training_scheduler_timestep_spacing {leading,linspace,trailing}] [--inference_scheduler_timestep_spacing {leading,linspace,trailing}] [--timestep_bias_strategy {earlier,later,none}] [--timestep_bias_multiplier TIMESTEP_BIAS_MULTIPLIER] - [--timestep_bias_begin TIMESTEP_BIAS_BEGIN] [--timestep_bias_end TIMESTEP_BIAS_END] [--timestep_bias_portion TIMESTEP_BIAS_PORTION] [--rescale_betas_zero_snr] [--vae_dtype VAE_DTYPE] [--vae_batch_size VAE_BATCH_SIZE] [--keep_vae_loaded] - [--skip_file_discovery SKIP_FILE_DISCOVERY] [--revision REVISION] [--tokenizer_name TOKENIZER_NAME] --instance_data_dir INSTANCE_DATA_DIR [--data_backend {local,aws}] [--write_batch_size WRITE_BATCH_SIZE] [--apply_dataset_padding] [--aws_config_file AWS_CONFIG_FILE] - [--aws_bucket_name AWS_BUCKET_NAME] [--aws_endpoint_url AWS_ENDPOINT_URL] [--aws_region_name AWS_REGION_NAME] [--aws_access_key_id AWS_ACCESS_KEY_ID] [--aws_secret_access_key AWS_SECRET_ACCESS_KEY] [--cache_dir CACHE_DIR] [--dataset_name DATASET_NAME] - [--dataset_config_name DATASET_CONFIG_NAME] [--image_column IMAGE_COLUMN] [--image_prompt_column IMAGE_PROMPT_COLUMN] [--seen_state_path SEEN_STATE_PATH] [--state_path STATE_PATH] [--caption_strategy {filename,textfile,instance_prompt}] [--instance_prompt INSTANCE_PROMPT] - [--output_dir OUTPUT_DIR] [--seed SEED] [--seed_for_each_device] [--resolution RESOLUTION] [--resolution_type {pixel,area}] [--minimum_image_size MINIMUM_IMAGE_SIZE] [--crops_coords_top_left_h CROPS_COORDS_TOP_LEFT_H] [--crops_coords_top_left_w CROPS_COORDS_TOP_LEFT_W] - [--center_crop] [--random_flip] [--train_text_encoder] [--train_batch_size TRAIN_BATCH_SIZE] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_train_samples MAX_TRAIN_SAMPLES] [--max_train_steps MAX_TRAIN_STEPS] [--checkpointing_steps CHECKPOINTING_STEPS] - [--checkpoints_total_limit CHECKPOINTS_TOTAL_LIMIT] [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--gradient_checkpointing] [--learning_rate LEARNING_RATE] [--scale_lr] [--lr_scheduler LR_SCHEDULER] - [--lr_warmup_steps LR_WARMUP_STEPS] [--lr_num_cycles LR_NUM_CYCLES] [--lr_power LR_POWER] [--use_ema] [--non_ema_revision NON_EMA_REVISION] [--use_8bit_adam] [--use_adafactor_optimizer] [--use_dadapt_optimizer] [--dadaptation_learning_rate DADAPTATION_LEARNING_RATE] - [--dataloader_num_workers DATALOADER_NUM_WORKERS] [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2] [--adam_weight_decay ADAM_WEIGHT_DECAY] [--adam_epsilon ADAM_EPSILON] [--max_grad_norm MAX_GRAD_NORM] [--push_to_hub] [--hub_token HUB_TOKEN] - [--hub_model_id HUB_MODEL_ID] [--logging_dir 
LOGGING_DIR] [--allow_tf32] [--report_to REPORT_TO] [--track_luminance] [--tracker_run_name TRACKER_RUN_NAME] [--tracker_project_name TRACKER_PROJECT_NAME] [--validation_prompt VALIDATION_PROMPT] [--validation_prompt_library] - [--user_prompt_library USER_PROMPT_LIBRARY] [--num_validation_images NUM_VALIDATION_IMAGES] [--validation_steps VALIDATION_STEPS] [--validation_num_inference_steps VALIDATION_NUM_INFERENCE_STEPS] [--validation_resolution VALIDATION_RESOLUTION] - [--validation_noise_scheduler {ddim,ddpm,euler,euler-a,unipc}] [--enable_watermark] [--mixed_precision {no,fp16,bf16}] [--local_rank LOCAL_RANK] [--enable_xformers_memory_efficient_attention] [--set_grads_to_none] [--noise_offset NOISE_OFFSET] - [--validation_epochs VALIDATION_EPOCHS] [--validation_guidance VALIDATION_GUIDANCE] [--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE] [--validation_randomize] [--validation_seed VALIDATION_SEED] [--fully_unload_text_encoder] - [--freeze_encoder_before FREEZE_ENCODER_BEFORE] [--freeze_encoder_after FREEZE_ENCODER_AFTER] [--freeze_encoder_strategy FREEZE_ENCODER_STRATEGY] [--print_filenames] [--debug_aspect_buckets] [--debug_dataset_loader] [--freeze_encoder] - [--text_encoder_limit TEXT_ENCODER_LIMIT] [--prepend_instance_prompt] [--only_instance_prompt] [--caption_dropout_interval CAPTION_DROPOUT_INTERVAL] [--conditioning_dropout_probability CONDITIONING_DROPOUT_PROBABILITY] - [--caption_dropout_probability CAPTION_DROPOUT_PROBABILITY] [--input_pertubation INPUT_PERTUBATION] [--use_original_images USE_ORIGINAL_IMAGES] [--delete_unwanted_images] [--delete_problematic_images] [--offset_noise] [--learning_rate_end LEARNING_RATE_END] +usage: train_sdxl.py [-h] [--snr_gamma SNR_GAMMA] --pretrained_model_name_or_path PRETRAINED_MODEL_NAME_OR_PATH [--pretrained_vae_model_name_or_path PRETRAINED_VAE_MODEL_NAME_OR_PATH] + [--prediction_type {epsilon,v_prediction,sample}] [--snr_weight SNR_WEIGHT] [--training_scheduler_timestep_spacing {leading,linspace,trailing}] + [--inference_scheduler_timestep_spacing {leading,linspace,trailing}] [--timestep_bias_strategy {earlier,later,range,none}] [--timestep_bias_multiplier TIMESTEP_BIAS_MULTIPLIER] + [--timestep_bias_begin TIMESTEP_BIAS_BEGIN] [--timestep_bias_end TIMESTEP_BIAS_END] [--timestep_bias_portion TIMESTEP_BIAS_PORTION] [--rescale_betas_zero_snr] [--vae_dtype VAE_DTYPE] + [--vae_batch_size VAE_BATCH_SIZE] [--keep_vae_loaded] [--skip_file_discovery SKIP_FILE_DISCOVERY] [--revision REVISION] [--tokenizer_name TOKENIZER_NAME] --instance_data_dir INSTANCE_DATA_DIR + [--cache_dir_text CACHE_DIR_TEXT] [--cache_dir_vae CACHE_DIR_VAE] [--data_backend {local,aws}] [--write_batch_size WRITE_BATCH_SIZE] [--apply_dataset_padding] [--aws_config_file AWS_CONFIG_FILE] + [--aws_bucket_name AWS_BUCKET_NAME] [--aws_bucket_image_prefix AWS_BUCKET_IMAGE_PREFIX] [--aws_endpoint_url AWS_ENDPOINT_URL] [--aws_region_name AWS_REGION_NAME] [--aws_access_key_id AWS_ACCESS_KEY_ID] + [--aws_secret_access_key AWS_SECRET_ACCESS_KEY] [--cache_dir CACHE_DIR] [--cache_clear_validation_prompts] [--dataset_name DATASET_NAME] [--dataset_config_name DATASET_CONFIG_NAME] + [--image_column IMAGE_COLUMN] [--image_prompt_column IMAGE_PROMPT_COLUMN] [--seen_state_path SEEN_STATE_PATH] [--state_path STATE_PATH] [--caption_strategy {filename,textfile,instance_prompt}] + [--instance_prompt INSTANCE_PROMPT] [--output_dir OUTPUT_DIR] [--seed SEED] [--seed_for_each_device SEED_FOR_EACH_DEVICE] [--resolution RESOLUTION] [--resolution_type {pixel,area}] + [--minimum_image_size 
MINIMUM_IMAGE_SIZE] [--crops_coords_top_left_h CROPS_COORDS_TOP_LEFT_H] [--crops_coords_top_left_w CROPS_COORDS_TOP_LEFT_W] [--center_crop] [--random_flip] [--train_text_encoder] + [--train_batch_size TRAIN_BATCH_SIZE] [--num_train_epochs NUM_TRAIN_EPOCHS] [--max_train_samples MAX_TRAIN_SAMPLES] [--max_train_steps MAX_TRAIN_STEPS] [--checkpointing_steps CHECKPOINTING_STEPS] + [--checkpoints_total_limit CHECKPOINTS_TOTAL_LIMIT] [--resume_from_checkpoint RESUME_FROM_CHECKPOINT] [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS] [--gradient_checkpointing] + [--learning_rate LEARNING_RATE] [--scale_lr] [--lr_scheduler LR_SCHEDULER] [--lr_warmup_steps LR_WARMUP_STEPS] [--lr_num_cycles LR_NUM_CYCLES] [--lr_power LR_POWER] [--use_ema] + [--non_ema_revision NON_EMA_REVISION] [--offload_param_path OFFLOAD_PARAM_PATH] [--use_8bit_adam] [--use_adafactor_optimizer] [--use_dadapt_optimizer] + [--dadaptation_learning_rate DADAPTATION_LEARNING_RATE] [--adam_beta1 ADAM_BETA1] [--adam_beta2 ADAM_BETA2] [--adam_weight_decay ADAM_WEIGHT_DECAY] [--adam_epsilon ADAM_EPSILON] + [--max_grad_norm MAX_GRAD_NORM] [--push_to_hub] [--hub_token HUB_TOKEN] [--hub_model_id HUB_MODEL_ID] [--logging_dir LOGGING_DIR] [--validation_torch_compile VALIDATION_TORCH_COMPILE] + [--validation_torch_compile_mode {reduce-overhead,default}] [--allow_tf32] [--report_to REPORT_TO] [--track_luminance] [--tracker_run_name TRACKER_RUN_NAME] [--tracker_project_name TRACKER_PROJECT_NAME] + [--validation_prompt VALIDATION_PROMPT] [--validation_prompt_library] [--user_prompt_library USER_PROMPT_LIBRARY] [--num_validation_images NUM_VALIDATION_IMAGES] [--validation_steps VALIDATION_STEPS] + [--validation_num_inference_steps VALIDATION_NUM_INFERENCE_STEPS] [--validation_resolution VALIDATION_RESOLUTION] [--validation_noise_scheduler {ddim,ddpm,euler,euler-a,unipc}] [--disable_compel] + [--enable_watermark] [--mixed_precision {no,fp16,bf16}] [--local_rank LOCAL_RANK] [--enable_xformers_memory_efficient_attention] [--set_grads_to_none] [--noise_offset NOISE_OFFSET] + [--validation_epochs VALIDATION_EPOCHS] [--validation_guidance VALIDATION_GUIDANCE] [--validation_guidance_rescale VALIDATION_GUIDANCE_RESCALE] [--validation_randomize] + [--validation_seed VALIDATION_SEED] [--fully_unload_text_encoder] [--freeze_encoder_before FREEZE_ENCODER_BEFORE] [--freeze_encoder_after FREEZE_ENCODER_AFTER] + [--freeze_encoder_strategy FREEZE_ENCODER_STRATEGY] [--print_filenames] [--debug_aspect_buckets] [--debug_dataset_loader] [--freeze_encoder] [--text_encoder_limit TEXT_ENCODER_LIMIT] + [--prepend_instance_prompt] [--only_instance_prompt] [--caption_dropout_interval CAPTION_DROPOUT_INTERVAL] [--conditioning_dropout_probability CONDITIONING_DROPOUT_PROBABILITY] + [--caption_dropout_probability CAPTION_DROPOUT_PROBABILITY] [--input_pertubation INPUT_PERTUBATION] [--use_original_images USE_ORIGINAL_IMAGES] [--delete_unwanted_images] [--delete_problematic_images] + [--offset_noise] [--learning_rate_end LEARNING_RATE_END] The following SimpleTuner command-line options are available: @@ -160,15 +169,18 @@ options: --pretrained_vae_model_name_or_path PRETRAINED_VAE_MODEL_NAME_OR_PATH Path to an improved VAE to stabilize training. For more details check out: https://github.com/huggingface/diffusers/pull/4038. --prediction_type {epsilon,v_prediction,sample} - The type of prediction to use for the u-net. Choose between ['epsilon', 'v_prediction', 'sample']. For SD 2.1-v, this is v_prediction. For 2.1-base, it is epsilon. SDXL is generally epsilon. 
SD 1.5 is epsilon. + The type of prediction to use for the u-net. Choose between ['epsilon', 'v_prediction', 'sample']. For SD 2.1-v, this is v_prediction. For 2.1-base, it is epsilon. SDXL is generally epsilon. SD 1.5 is + epsilon. --snr_weight SNR_WEIGHT - When training a model using `--prediction_type=sample`, one can supply an SNR weight value to augment the loss with. If a value of 0.5 is provided here, the loss is taken half from the SNR and half from the MSE. + When training a model using `--prediction_type=sample`, one can supply an SNR weight value to augment the loss with. If a value of 0.5 is provided here, the loss is taken half from the SNR and half + from the MSE. --training_scheduler_timestep_spacing {leading,linspace,trailing} Spacing timesteps can fundamentally alter the course of history. Er, I mean, your model weights. For all training, including epsilon, it would seem that 'trailing' is the right choice. --inference_scheduler_timestep_spacing {leading,linspace,trailing} The Bytedance paper on zero terminal SNR recommends inference using 'trailing'. - --timestep_bias_strategy {earlier,later,none} - The timestep bias strategy, which may help direct the model toward learning low or frequency details. Choices: ['earlier', 'later', 'none']. The default is 'none', which means no bias is applied, and training proceeds normally. The value of 'later' will prefer to generate samples for later timesteps. + --timestep_bias_strategy {earlier,later,range,none} + The timestep bias strategy, which may help direct the model toward learning low or high frequency details. Choices: ['earlier', 'later', 'range', 'none']. The default is 'none', which means no bias is applied, and + training proceeds normally. The value of 'later' will prefer to generate samples for later timesteps. --timestep_bias_multiplier TIMESTEP_BIAS_MULTIPLIER The multiplier for the bias. Defaults to 1.0, which means no bias is applied. A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it. --timestep_bias_begin TIMESTEP_BIAS_BEGIN @@ -176,32 +188,45 @@ options: --timestep_bias_end TIMESTEP_BIAS_END When using `--timestep_bias_strategy=range`, the final timestep to bias. Defaults to 1000, which is the number of timesteps that SDXL Base and SD 2.x were trained on. --timestep_bias_portion TIMESTEP_BIAS_PORTION - The portion of timesteps to bias. Defaults to 0.25, which 25 percent of timesteps will be biased. A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines whether the biased portions are in the earlier or later timesteps. + The portion of timesteps to bias. Defaults to 0.25, which means 25 percent of timesteps will be biased. A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` + determines whether the biased portions are in the earlier or later timesteps. --rescale_betas_zero_snr If set, will rescale the betas to zero terminal SNR. This is recommended for training with v_prediction. For epsilon, this might help with fine details, but will not result in contrast improvements. --vae_dtype VAE_DTYPE The dtype of the VAE model. Choose between ['default', 'fp16', 'fp32', 'bf16'].The default VAE dtype is float32, due to NaN issues in SDXL 1.0. --vae_batch_size VAE_BATCH_SIZE - When pre-caching latent vectors, this is the batch size to use. Decreasing this may help with VRAM issues, but if you are at that point of contention, it's possible that your GPU has too little RAM. Default: 4.
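As an aside on the timestep bias options above: the weighting they describe can be sketched in a few lines. This is a minimal illustration rather than SimpleTuner's actual implementation; the helper name and the exact handling of the 'range' strategy are assumptions:

```python
import torch

def generate_timestep_weights(num_timesteps=1000, strategy="none",
                              multiplier=1.0, portion=0.25,
                              bias_begin=0, bias_end=1000):
    # Start from a uniform distribution over all timesteps.
    weights = torch.ones(num_timesteps)
    if strategy == "earlier":
        # Up-weight the first `portion` of the schedule.
        weights[: int(num_timesteps * portion)] *= multiplier
    elif strategy == "later":
        # Up-weight the last `portion` of the schedule.
        weights[int(num_timesteps * (1 - portion)):] *= multiplier
    elif strategy == "range":
        # Up-weight an explicit [begin, end) window of timesteps.
        weights[bias_begin:bias_end] *= multiplier
    # Normalise so the weights form a sampling distribution.
    return weights / weights.sum()

# Draw a batch of biased timesteps for one training step.
weights = generate_timestep_weights(strategy="later", multiplier=2.0)
timesteps = torch.multinomial(weights, num_samples=8, replacement=True)
```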
+ When pre-caching latent vectors, this is the batch size to use. Decreasing this may help with VRAM issues, but if you are at that point of contention, it's possible that your GPU has too little RAM. + Default: 4. --keep_vae_loaded If set, will keep the VAE loaded in memory. This can reduce disk churn, but consumes VRAM during the forward pass. --skip_file_discovery SKIP_FILE_DISCOVERY - Comma-separated values of which stages to skip discovery for. Skipping any stage will speed up resumption, but will increase the risk of errors, as missing images or incorrectly bucketed images may not be caught. 'vae' will skip the VAE cache process, 'aspect' will not build any aspect buckets, and 'text' will avoid text embed management. Valid options: aspect, vae, text. + Comma-separated values of which stages to skip discovery for. Skipping any stage will speed up resumption, but will increase the risk of errors, as missing images or incorrectly bucketed images may + not be caught. 'vae' will skip the VAE cache process, 'aspect' will not build any aspect buckets, and 'text' will avoid text embed management. Valid options: aspect, vae, text. --revision REVISION Revision of pretrained model identifier from huggingface.co/models. Trainable model components should be float32 precision. --tokenizer_name TOKENIZER_NAME Pretrained tokenizer name or path if not the same as model_name --instance_data_dir INSTANCE_DATA_DIR - A folder containing the training data. Folder contents must either follow the structure described in the SimpleTuner documentation (https://github.com/bghira/SimpleTuner), or the structure described in https://huggingface.co/docs/datasets/image_dataset#imagefolder. For - 🤗 Datasets in particular, a `metadata.jsonl` file must exist to provide the captions for the images. For SimpleTuner layout, the images can be in subfolders. No particular config is required. Ignored if `dataset_name` is specified. + A folder containing the training data. Folder contents must either follow the structure described in the SimpleTuner documentation (https://github.com/bghira/SimpleTuner), or the structure described + in https://huggingface.co/docs/datasets/image_dataset#imagefolder. For 🤗 Datasets in particular, a `metadata.jsonl` file must exist to provide the captions for the images. For SimpleTuner layout, the + images can be in subfolders. No particular config is required. Ignored if `dataset_name` is specified. + --cache_dir_text CACHE_DIR_TEXT + This is the path to a local directory that will contain your text embed cache. + --cache_dir_vae CACHE_DIR_VAE + This is the path to a local directory that will contain your VAE outputs. Unlike the text embed cache, your VAE latents will be stored in the AWS data backend. If the AWS backend is in use, this will + be a prefix for the bucket's VAE cache entries. --data_backend {local,aws} The data backend to use. Choose between ['local', 'aws']. Default: local. If using AWS, you must set the AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables. --write_batch_size WRITE_BATCH_SIZE - When using certain storage backends, it is better to batch smaller writes rather than continuous dispatching. In SimpleTuner, write batching is currently applied during VAE caching, when many small objects are written. This mostly applies to S3, but some shared server filesystems may benefit as well, eg. Ceph. Default: 64. + When using certain storage backends, it is better to batch smaller writes rather than continuous dispatching. 
In SimpleTuner, write batching is currently applied during VAE caching, when many small + objects are written. This mostly applies to S3, but some shared server filesystems may benefit as well, eg. Ceph. Default: 64. --apply_dataset_padding - If set, will apply padding to the dataset to ensure that the number of images is divisible by the batch. This has some side-effects (especially on smaller datasets) of over-sampling and overly repeating images. + If set, will apply padding to the dataset to ensure that the number of images is divisible by the batch. This has some side-effects (especially on smaller datasets) of over-sampling and overly + repeating images. --aws_config_file AWS_CONFIG_FILE Path to the AWS configuration file in JSON format. Config key names are the same as SimpleTuner option counterparts. --aws_bucket_name AWS_BUCKET_NAME The AWS bucket name to use. + --aws_bucket_image_prefix AWS_BUCKET_IMAGE_PREFIX + Instead of using --instance_data_dir, AWS S3 relies on aws_bucket_*_prefix parameters. When provided, this parameter will be prepended to the image path. --aws_endpoint_url AWS_ENDPOINT_URL The AWS server to use. If not specified, will use the default server for the region specified. For Wasabi, use https://s3.wasabisys.com. --aws_region_name AWS_REGION_NAME @@ -212,8 +237,12 @@ options: The AWS secret access key. --cache_dir CACHE_DIR The directory where the downloaded models and datasets will be stored. + --cache_clear_validation_prompts + When provided, any validation prompt entries in the text embed cache will be recreated. This is useful if you've modified any of the existing prompts, or disabled/enabled Compel via + `--disable_compel`. --dataset_name DATASET_NAME - The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private, dataset). It can also be a path pointing to a local copy of a dataset in your filesystem, or to a folder containing files that 🤗 Datasets can understand. + The name of the Dataset (from the HuggingFace hub) to train on (could be your own, possibly private, dataset). It can also be a path pointing to a local copy of a dataset in your filesystem, or to a + folder containing files that 🤗 Datasets can understand. --dataset_config_name DATASET_CONFIG_NAME The config of the Dataset, leave as None if there's only one config. --image_column IMAGE_COLUMN @@ -225,26 +254,29 @@ options: --state_path STATE_PATH A JSON document containing the current state of training, will be placed here. --caption_strategy {filename,textfile,instance_prompt} - The default captioning strategy, 'filename', will use the filename as the caption, after stripping some characters like underscores.The 'textfile' strategy will use the contents of a text file with the same name as the image. + The default captioning strategy, 'filename', will use the filename as the caption, after stripping some characters like underscores. The 'textfile' strategy will use the contents of a text file with + the same name as the image. --instance_prompt INSTANCE_PROMPT This is unused. Filenames will be the captions instead. --output_dir OUTPUT_DIR The output directory where the model predictions and checkpoints will be written. --seed SEED A seed for reproducible training. - --seed_for_each_device - If provided, a unique seed will be used for each GPU. This is done deterministically, so that each GPU will receive the same seed across invocations. + --seed_for_each_device SEED_FOR_EACH_DEVICE + By default, a unique seed will be used for each GPU.
This is done deterministically, so that each GPU will receive the same seed across invocations. If --seed_for_each_device=false is provided, then + we will use the same seed across all GPUs, which will almost certainly result in the over-sampling of inputs on larger datasets. --resolution RESOLUTION The resolution for input images, all the images in the train/validation dataset will be resized to this resolution. If using --resolution_type=area, this float value represents megapixels. --resolution_type {pixel,area} - Resizing images maintains aspect ratio. This defines the resizing strategy. If 'pixel', the images will be resized to the resolution by pixel edge. If 'area', the images will be resized so the pixel area is this many megapixels. + Resizing images maintains aspect ratio. This defines the resizing strategy. If 'pixel', the images will be resized to the resolution by pixel edge. If 'area', the images will be resized so the pixel + area is this many megapixels. --minimum_image_size MINIMUM_IMAGE_SIZE The minimum resolution for both sides of input images. If --delete_unwanted_images is set, images smaller than this will be DELETED. --crops_coords_top_left_h CROPS_COORDS_TOP_LEFT_H Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet. --crops_coords_top_left_w CROPS_COORDS_TOP_LEFT_W Coordinate for (the height) to be included in the crop coordinate embeddings needed by SDXL UNet. - --center_crop Whether to center crop the input images to the resolution. If not set, the images will be randomly cropped. The images will be resized to the resolution first before cropping. If training SDXL, the VAE cache and aspect bucket cache will need to be (re)built so they - include crop coordinates. + --center_crop Whether to center crop the input images to the resolution. If not set, the images will be randomly cropped. The images will be resized to the resolution first before cropping. If training SDXL, the + VAE cache and aspect bucket cache will need to be (re)built so they include crop coordinates. --random_flip whether to randomly flip images horizontally --train_text_encoder Whether to train the text encoder. If set, the text encoder should be float32 precision. --train_batch_size TRAIN_BATCH_SIZE @@ -255,8 +287,9 @@ options: --max_train_steps MAX_TRAIN_STEPS Total number of training steps to perform. If provided, overrides num_train_epochs. --checkpointing_steps CHECKPOINTING_STEPS - Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. In the case that the checkpoint is better than the final trained model, the checkpoint can also be used for inference.Using a - checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components. See https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step instructions. + Save a checkpoint of the training state every X updates. Checkpoints can be used for resuming training via `--resume_from_checkpoint`. In the case that the checkpoint is better than the final trained + model, the checkpoint can also be used for inference. Using a checkpoint for inference requires separate loading of the original pipeline and the individual checkpointed model components. See + https://huggingface.co/docs/diffusers/main/en/training/dreambooth#performing-inference-using-a-saved-checkpoint for step by step instructions.
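The inference-from-checkpoint flow described in the linked diffusers guide looks roughly like the sketch below; the checkpoint path and base model are placeholders, and only the UNet is swapped in here:

```python
from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel

# Load only the UNet weights saved inside a training checkpoint directory,
# then assemble the rest of the pipeline from the original base model.
unet = UNet2DConditionModel.from_pretrained(
    "output/checkpoint-5000", subfolder="unet"  # hypothetical path
)
pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", unet=unet
)
pipeline.to("cuda")
image = pipeline(prompt="a photo of a corgi").images[0]
```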
--checkpoints_total_limit CHECKPOINTS_TOTAL_LIMIT Max number of checkpoints to store. --resume_from_checkpoint RESUME_FROM_CHECKPOINT @@ -278,6 +311,8 @@ options: --use_ema Whether to use EMA (exponential moving average) model. --non_ema_revision NON_EMA_REVISION Revision of pretrained non-ema model identifier. Must be a branch, tag or git identifier of the local or remote repository specified with --pretrained_model_name_or_path. + --offload_param_path OFFLOAD_PARAM_PATH + When using DeepSpeed ZeRO stage 2 or 3 with NVMe offload, this may be specified to provide a path for the offload. --use_8bit_adam Whether or not to use 8-bit Adam from bitsandbytes. --use_adafactor_optimizer Whether or not to use the Adafactor optimizer. @@ -285,8 +320,6 @@ options: Whether or not to use the discriminator adaptation optimizer. --dadaptation_learning_rate DADAPTATION_LEARNING_RATE Learning rate for the discriminator adaptation. Default: 1.0 - --dataloader_num_workers DATALOADER_NUM_WORKERS - Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process. --adam_beta1 ADAM_BETA1 The beta1 parameter for the Adam optimizer. --adam_beta2 ADAM_BETA2 @@ -304,11 +337,16 @@ options: The name of the repository to keep in sync with the local `output_dir`. --logging_dir LOGGING_DIR [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***. + --validation_torch_compile VALIDATION_TORCH_COMPILE + Supply `--validation_torch_compile=true` to enable the use of torch.compile() on the validation pipeline. For some setups, torch.compile() may error out. This is dependent on PyTorch version, phase of + the moon, but if it works, you should leave it enabled for a great speed-up. + --validation_torch_compile_mode {reduce-overhead,default} + PyTorch provides different modes for the Torch Inductor when compiling graphs. reduce-overhead, the default mode, provides the most benefit. --allow_tf32 Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices --report_to REPORT_TO The integration to report the results and logs to. Supported platforms are `"tensorboard"` (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations. - --track_luminance When provided, the luminance of the images will be tracked during training. This has a pretty substantial compute cost for higher resolution images, though it is easily justified when training with offset noise or some other noise modification technique that could bias - the model toward very-dark images. + --track_luminance When provided, the luminance of the images will be tracked during training. This has a pretty substantial compute cost for higher resolution images, though it is easily justified when training with + offset noise or some other noise modification technique that could bias the model toward very-dark images. --tracker_run_name TRACKER_RUN_NAME The name of the run to track with the tracker. --tracker_project_name TRACKER_PROJECT_NAME @@ -324,21 +362,26 @@ options: --validation_steps VALIDATION_STEPS Run validation every X steps. Validation consists of running the prompt `args.validation_prompt` multiple times: `args.num_validation_images` and logging the images. --validation_num_inference_steps VALIDATION_NUM_INFERENCE_STEPS - The default scheduler, DDIM, benefits from more steps.
UniPC can do well with just 10-15. For more speed during validations, reduce this value. For better quality, increase it. For model distilation, you will likely want to keep this low. + The default scheduler, DDIM, benefits from more steps. UniPC can do well with just 10-15. For more speed during validations, reduce this value. For better quality, increase it. For model distillation, + you will likely want to keep this low. --validation_resolution VALIDATION_RESOLUTION Square resolution images will be output at this resolution (256x256). --validation_noise_scheduler {ddim,ddpm,euler,euler-a,unipc} - When validating the model at inference time, a different scheduler may be chosen. UniPC can offer better speed, and Euler A can put up with instabilities a bit better. For zero-terminal SNR models, DDIM is the best choice. Choices: ['ddim', 'ddpm', 'euler', 'euler-a', 'unipc'], Default: ddim - --enable_watermark The SDXL 0.9 and 1.0 licenses both require a watermark be used to identify any images created to be shared. Since the images created during validation typically are not shared, and we want the most accurate results, this watermarker is disabled by default. If you are - sharing the validation images, it is up to you to ensure that you are complying with the license, whether that is through this watermarker, or another. + When validating the model at inference time, a different scheduler may be chosen. UniPC can offer better speed, and Euler A can put up with instabilities a bit better. For zero-terminal SNR models, + DDIM is the best choice. Choices: ['ddim', 'ddpm', 'euler', 'euler-a', 'unipc'], Default: ddim + --disable_compel If provided, prompts will be handled using the typical prompt encoding strategy. Otherwise, the default behaviour is to use Compel for prompt embed generation. + --enable_watermark The SDXL 0.9 and 1.0 licenses both require a watermark be used to identify any images created to be shared. Since the images created during validation typically are not shared, and we want the most + accurate results, this watermarker is disabled by default. If you are sharing the validation images, it is up to you to ensure that you are complying with the license, whether that is through this + watermarker, or another. --mixed_precision {no,fp16,bf16} - Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config. + Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10 and an Nvidia Ampere GPU. Defaults to the value of accelerate config of the current system or the + flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config. --local_rank LOCAL_RANK For distributed training: local_rank --enable_xformers_memory_efficient_attention Whether or not to use xformers. - --set_grads_to_none Save more memory by using setting grads to None instead of zero. Be aware, that this changes certain behaviors, so disable this argument if it causes any problems. - More info: https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html + --set_grads_to_none Save more memory by setting grads to None instead of zero. Be aware that this changes certain behaviors, so disable this argument if it causes any problems.
More info: + https://pytorch.org/docs/stable/generated/torch.optim.Optimizer.zero_grad.html --noise_offset NOISE_OFFSET The scale of noise offset. Default: 0.1 --validation_epochs VALIDATION_EPOCHS @@ -352,13 +395,15 @@ options: --validation_seed VALIDATION_SEED If not supplied, the value for --seed will be used. If neither those nor --validation_randomize are supplied, a seed of zero is used. --fully_unload_text_encoder - If set, will fully unload the text_encoder from memory when not in use. This currently has the side effect of crashing validations, but it is useful for initiating VAE caching on GPUs that would otherwise be too small. + If set, will fully unload the text_encoder from memory when not in use. This currently has the side effect of crashing validations, but it is useful for initiating VAE caching on GPUs that would + otherwise be too small. --freeze_encoder_before FREEZE_ENCODER_BEFORE When using 'before' strategy, we will freeze layers earlier than this. --freeze_encoder_after FREEZE_ENCODER_AFTER When using 'after' strategy, we will freeze layers later than this. --freeze_encoder_strategy FREEZE_ENCODER_STRATEGY - When freezing the text_encoder, we can use the 'before', 'between', or 'after' strategy. The 'between' strategy will freeze layers between those two values, leaving the outer layers unfrozen. The default strategy is to freeze all layers from 17 up. This can be helpful when fine-tuning Stable Diffusion 2.1 on a new style. + When freezing the text_encoder, we can use the 'before', 'between', or 'after' strategy. The 'between' strategy will freeze layers between those two values, leaving the outer layers unfrozen. The + default strategy is to freeze all layers from 17 up. This can be helpful when fine-tuning Stable Diffusion 2.1 on a new style. --print_filenames If any image files are stopping the process eg. due to corruption or truncation, this will help identify which is at fault. --debug_aspect_buckets If set, will print excessive debugging for aspect bucket operations. @@ -372,7 +417,8 @@ options: --only_instance_prompt Use the instance prompt instead of the caption from filename. --caption_dropout_interval CAPTION_DROPOUT_INTERVAL - Every X steps, we will drop the caption from the input to assist in classifier-free guidance training. When StabilityAI trained Stable Diffusion, a value of 10 was used. Very high values might be useful to do some sort of enforced style training. Default value is zero, maximum value is 100. + Every X steps, we will drop the caption from the input to assist in classifier-free guidance training. When StabilityAI trained Stable Diffusion, a value of 10 was used. Very high values might be + useful to do some sort of enforced style training. Default value is zero, maximum value is 100. --conditioning_dropout_probability CONDITIONING_DROPOUT_PROBABILITY Conditioning dropout probability. Experimental. See section 3.2.1 in the paper: https://arxiv.org/abs/2211.09800. --caption_dropout_probability CAPTION_DROPOUT_PROBABILITY @@ -388,4 +434,4 @@ options: --offset_noise Fine-tuning against a modified noise See: https://www.crosslabs.org//blog/diffusion-with-offset-noise for more information. --learning_rate_end LEARNING_RATE_END A polynomial learning rate will end up at this value after the specified number of warmup steps. 
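To make `--learning_rate_end` concrete, here is a small sketch using the equivalent polynomial schedule from `transformers`; the step counts and rates are arbitrary examples:

```python
import torch
from transformers import get_polynomial_decay_schedule_with_warmup

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Warm up for 100 steps, then decay polynomially toward lr_end.
scheduler = get_polynomial_decay_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=1000,
    lr_end=1e-6,   # analogous to --learning_rate_end
    power=1.0,     # analogous to --lr_power
)

for step in range(1000):
    optimizer.step()
    scheduler.step()
print(scheduler.get_last_lr())  # roughly 1e-6 by the final step
```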
-``` \ No newline at end of file + ``` \ No newline at end of file From 465430b999f90dcb1c1623c85a85d306658cb3ad Mon Sep 17 00:00:00 2001 From: bghira Date: Fri, 13 Oct 2023 11:49:18 -0700 Subject: [PATCH 02/24] Fix for zero-training run --- train_sdxl.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/train_sdxl.py b/train_sdxl.py index 7efe1a75..837d01f2 100644 --- a/train_sdxl.py +++ b/train_sdxl.py @@ -818,7 +818,9 @@ def main(): # Prepare everything with our `accelerator`. logger.info(f"Loading our accelerator...") - unet, train_dataloader, lr_scheduler, optimizer = accelerator.prepare(unet, train_dataloader, lr_scheduler, optimizer) + unet, train_dataloader, lr_scheduler, optimizer = accelerator.prepare( + unet, train_dataloader, lr_scheduler, optimizer + ) if args.use_ema: logger.info("Moving EMA model weights to accelerator...") ema_unet.to(accelerator.device, dtype=weight_dtype) @@ -1022,6 +1024,7 @@ def main(): accelerator.wait_for_everyone() timesteps_buffer = [] train_loss = 0.0 + step = global_step training_luminance_values = [] for epoch in range(first_epoch, args.num_train_epochs): From 4e855c9126e3625a913933297abbffef0eacaf0b Mon Sep 17 00:00:00 2001 From: bghira Date: Fri, 13 Oct 2023 12:01:29 -0700 Subject: [PATCH 03/24] BucketManager: reduce CPU use of busy-waiting by sleeping a bit longer in between cycles. Better labeling of the status bar, and less frequent updates. --- helpers/multiaspect/bucket.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/helpers/multiaspect/bucket.py b/helpers/multiaspect/bucket.py index bedfab1b..73330a29 100644 --- a/helpers/multiaspect/bucket.py +++ b/helpers/multiaspect/bucket.py @@ -2,7 +2,7 @@ from helpers.multiaspect.image import MultiaspectImage from helpers.data_backend.base import BaseDataBackend from pathlib import Path -import json, logging, os +import json, logging, os, time from multiprocessing import Manager from tqdm import tqdm from multiprocessing import Process, Queue @@ -217,7 +217,13 @@ def compute_aspect_ratio_bucket_indices(self): for worker in workers: worker.start() - with tqdm(total=len(new_files), leave=False, ncols=100) as pbar: + with tqdm( + desc="Generating aspect bucket cache", + total=len(new_files), + leave=False, + ncols=100, + miniters=int(len(new_files) / 100), + ) as pbar: while any(worker.is_alive() for worker in workers): while not tqdm_queue.empty(): pbar.update(tqdm_queue.get()) @@ -237,6 +243,8 @@ def compute_aspect_ratio_bucket_indices(self): filepath=filepath, metadata=meta, update_json=False ) + time.sleep(0.1) + for worker in workers: worker.join() From 048c00a9bcc036a53dc5a41f4f8a347488a13dc8 Mon Sep 17 00:00:00 2001 From: bghira Date: Fri, 13 Oct 2023 20:00:30 -0700 Subject: [PATCH 04/24] Disable PIL RGBA warning. 
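The diff below silences this with a `warnings` filter; a standalone way to check that the same kind of filter behaves as intended is sketched here (the image path is hypothetical):

```python
import warnings

# Mirror the filter the patch installs: match by category, emitting
# module (a regex, prefix-matched), and the start of the message text.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="PIL",
    message="Palette images with Transparency expressed in bytes",
)

from PIL import Image

# Opening a palette-mode ("P") PNG that carries transparency normally
# emits the UserWarning on conversion; with the filter it stays silent.
image = Image.open("palette_with_transparency.png")  # hypothetical file
print(image.mode)
```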
--- helpers/log_format.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/helpers/log_format.py b/helpers/log_format.py index f0b270b9..222dbe3d 100644 --- a/helpers/log_format.py +++ b/helpers/log_format.py @@ -27,7 +27,9 @@ def format(self, record): accel_logger = logging.getLogger("DeepSpeed") accel_logger.setLevel(logging.WARNING) new_handler = logging.StreamHandler() -new_handler.setFormatter(ColorizedFormatter("%(asctime)s [%(levelname)s] (%(name)s) %(message)s")) +new_handler.setFormatter( + ColorizedFormatter("%(asctime)s [%(levelname)s] (%(name)s) %(message)s") +) # Remove existing handlers for handler in logger.handlers[:]: logger.removeHandler(handler) @@ -43,3 +45,13 @@ def format(self, record): pil_logger.setLevel("WARNING") pil_logger = logging.getLogger("PIL.PngImagePlugin") pil_logger.setLevel("WARNING") + +import warnings + +# Suppress specific PIL warning +warnings.filterwarnings( + "ignore", + category=UserWarning, + module="PIL", + message="Palette images with Transparency expressed in bytes should be converted to RGBA images", +) From 6ce30a3ef9792c6cd24a3023722b632ff762f21e Mon Sep 17 00:00:00 2001 From: bghira Date: Fri, 13 Oct 2023 20:23:24 -0700 Subject: [PATCH 05/24] Disable Transformers configuration_utils logging --- helpers/log_format.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/helpers/log_format.py b/helpers/log_format.py index 222dbe3d..59ffa668 100644 --- a/helpers/log_format.py +++ b/helpers/log_format.py @@ -42,9 +42,11 @@ def format(self, record): pil_logger = logging.getLogger("PIL") pil_logger.setLevel(logging.INFO) pil_logger = logging.getLogger("PIL.Image") -pil_logger.setLevel("WARNING") +pil_logger.setLevel("ERROR") pil_logger = logging.getLogger("PIL.PngImagePlugin") -pil_logger.setLevel("WARNING") +pil_logger.setLevel("ERROR") +pil_logger = logging.getLogger("transformers.configuration_utils") +pil_logger.setLevel("ERROR") import warnings From bec331340c162b9191989d718585c26912287cc2 Mon Sep 17 00:00:00 2001 From: bghira Date: Fri, 13 Oct 2023 20:24:18 -0700 Subject: [PATCH 06/24] Disable Diffusers configuration_utils logging --- helpers/log_format.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/helpers/log_format.py b/helpers/log_format.py index 59ffa668..3778ba6a 100644 --- a/helpers/log_format.py +++ b/helpers/log_format.py @@ -45,8 +45,10 @@ def format(self, record): pil_logger.setLevel("ERROR") pil_logger = logging.getLogger("PIL.PngImagePlugin") pil_logger.setLevel("ERROR") -pil_logger = logging.getLogger("transformers.configuration_utils") -pil_logger.setLevel("ERROR") +transformers_logger = logging.getLogger("transformers.configuration_utils") +transformers_logger.setLevel("ERROR") +diffusers_logger = logging.getLogger("diffusers.configuration_utils") +diffusers_logger.setLevel("ERROR") import warnings From 4f1874b02e9da9b92fce2a473674ef5e8922b285 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 04:39:55 -0700 Subject: [PATCH 07/24] VAECache: improve efficiency, perhaps, on multi-GPU systems, by reducing the number of checks we do --- helpers/caching/vae.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 5e17a095..4ce60bc1 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -245,6 +245,7 @@ def process_buckets(self, bucket_manager): f for f in aspect_bucket_cache[bucket] if os.path.splitext(os.path.basename(f))[0] not in processed_images + and f in 
self.local_unprocessed_files ] logger.debug( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" @@ -276,6 +277,14 @@ def process_buckets(self, bucket_manager): ) continue try: + # Does it exist on the backend? + if self.data_backend.exists( + self.generate_vae_cache_filename(filepath)[0] + ): + logger.debug( + f"Skipping {filepath} because it is already in the cache" + ) + continue logger.debug( f"Processing {filepath} because it is in local unprocessed files" ) From 0f87cd87c1d696a2bcc0281d35f3c19fa8b6e1ee Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 08:43:40 -0700 Subject: [PATCH 08/24] VAECache: Add debug_log method --- helpers/caching/vae.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 4ce60bc1..9234cc96 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -8,6 +8,7 @@ from helpers.data_backend.base import BaseDataBackend from helpers.training.state_tracker import StateTracker from helpers.training.multi_process import _get_rank as get_rank +from helpers.training.multi_process import rank_info logger = logging.getLogger("VAECache") logger.setLevel(os.environ.get("SIMPLETUNER_LOG_LEVEL") or "INFO") @@ -39,6 +40,10 @@ def __init__( self.vae_batch_size = vae_batch_size self.instance_data_root = instance_data_root self.transform = MultiaspectImage.get_image_transforms() + self.rank_info = rank_info() + + def debug_log(self, msg: str): + logger.debug(f"{self.rank_info}{msg}", main_process_only=False) def generate_vae_cache_filename(self, filepath: str) -> tuple: """Get the cache filename for a given image filepath and its base name.""" From 7ec5956b47d3d394ce4fc8d1d186052240d5c71d Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 08:44:34 -0700 Subject: [PATCH 09/24] VAECache: Use debug_log method --- helpers/caching/vae.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 9234cc96..7fe41768 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -79,23 +79,25 @@ def discover_all_files(self, directory: str = None): ) ) ) - logger.debug(f"VAECache discover_all_files found {len(all_image_files)} images") + self.debug_log( + f"VAECache discover_all_files found {len(all_image_files)} images" + ) return all_image_files def discover_unprocessed_files(self, directory: str = None): """Identify files that haven't been processed yet.""" all_image_files = StateTracker.get_image_files() existing_cache_files = StateTracker.get_vae_cache_files() - logger.debug( + self.debug_log( f"discover_unprocessed_files found {len(all_image_files)} images from StateTracker (truncated): {list(all_image_files)[:5]}" ) - logger.debug( + self.debug_log( f"discover_unprocessed_files found {len(existing_cache_files)} already-processed cache files (truncated): {list(existing_cache_files)[:5]}" ) cache_filenames = { self.generate_vae_cache_filename(file)[1] for file in all_image_files } - logger.debug( + self.debug_log( f"discover_unprocessed_files found {len(cache_filenames)} cache filenames (truncated): {list(cache_filenames)[:5]}" ) unprocessed_files = { @@ -182,14 +184,16 @@ def encode_images(self, images, filepaths, load_from_cache=True): def split_cache_between_processes(self): all_unprocessed_files = self.discover_unprocessed_files(self.cache_dir) - logger.debug(f"All unprocessed files: {all_unprocessed_files[:5]} (truncated)") + self.debug_log( 
+ f"All unprocessed files: {all_unprocessed_files[:5]} (truncated)" + ) # Use the accelerator to split the data with self.accelerator.split_between_processes( all_unprocessed_files ) as split_files: self.local_unprocessed_files = split_files # Print the first 5 as a debug log: - logger.debug( + self.debug_log( f"Local unprocessed files: {self.local_unprocessed_files[:5]} (truncated)" ) @@ -252,7 +256,7 @@ def process_buckets(self, bucket_manager): if os.path.splitext(os.path.basename(f))[0] not in processed_images and f in self.local_unprocessed_files ] - logger.debug( + self.debug_log( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) if len(relevant_files) == 0: @@ -277,7 +281,7 @@ def process_buckets(self, bucket_manager): ) test_filepath = f"{os.path.splitext(self.generate_vae_cache_filename(filepath)[1])[0]}.png" if test_filepath not in self.local_unprocessed_files: - logger.debug( + self.debug_log( f"Skipping {test_filepath} because it is not in local unprocessed files" ) continue @@ -286,11 +290,11 @@ def process_buckets(self, bucket_manager): if self.data_backend.exists( self.generate_vae_cache_filename(filepath)[0] ): - logger.debug( + self.debug_log( f"Skipping {filepath} because it is already in the cache" ) continue - logger.debug( + self.debug_log( f"Processing {filepath} because it is in local unprocessed files" ) image = self.data_backend.read_image(filepath) @@ -318,7 +322,7 @@ def process_buckets(self, bucket_manager): # If VAE input batch is ready if len(vae_input_images) >= self.vae_batch_size: - logger.debug( + self.debug_log( f"Reached a VAE batch size of {self.vae_batch_size} pixel groups, so we will now encode them into latents." ) latents_batch = self.encode_images( From e538c5107cb0ccb5ad40e4dd01907df423a6e2cd Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 08:59:15 -0700 Subject: [PATCH 10/24] VAECache: Fix debug_log method --- helpers/caching/vae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 7fe41768..b062a678 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -43,7 +43,7 @@ def __init__( self.rank_info = rank_info() def debug_log(self, msg: str): - logger.debug(f"{self.rank_info}{msg}", main_process_only=False) + logger.debug(f"{self.rank_info}{msg}") def generate_vae_cache_filename(self, filepath: str) -> tuple: """Get the cache filename for a given image filepath and its base name.""" From 6af707e699835c92bb60067c63dc0c0269be95a3 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 09:32:49 -0700 Subject: [PATCH 11/24] VAECache: Log some more details --- helpers/caching/vae.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index b062a678..e241bb40 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -259,6 +259,9 @@ def process_buckets(self, bucket_manager): self.debug_log( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) + self.debug_log( + f"We compared the basename {os.path.splitext(os.path.basename(relevant_files[0]))[0]} to the processed images (truncated) {processed_images[:5]}" + ) if len(relevant_files) == 0: continue From 218e3f424caaf4dbfa933cf38c81e3824ac5fcf1 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 09:49:33 -0700 Subject: [PATCH 12/24] VAECache: Log some more details (fix) --- helpers/caching/vae.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index e241bb40..55016d56 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -260,7 +260,7 @@ def process_buckets(self, bucket_manager): f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) self.debug_log( - f"We compared the basename {os.path.splitext(os.path.basename(relevant_files[0]))[0]} to the processed images (truncated) {processed_images[:5]}" + f"We compared the basename {os.path.splitext(os.path.basename(relevant_files[0]))[0]} to the processed images (truncated) {list(processed_images)[:5]}" ) if len(relevant_files) == 0: continue From b0728c6b7e59ab371f20a5f086362b03f7d4ac35 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 10:01:52 -0700 Subject: [PATCH 13/24] VAECache: Log some more details (fix) --- helpers/caching/vae.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 55016d56..21b109e4 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -256,12 +256,25 @@ def process_buckets(self, bucket_manager): if os.path.splitext(os.path.basename(f))[0] not in processed_images and f in self.local_unprocessed_files ] + for sample in aspect_bucket_cache[bucket]: + if sample not in relevant_files: + self.debug_log( + f"Skipping {sample} because it is not in relevant files" + ) + continue + if os.path.splitext(os.path.basename(sample))[0] in processed_images: + self.debug_log( + f"Skipping {sample} because it is in processed images" + ) + continue + if sample not in self.local_unprocessed_files: + self.debug_log( + f"Skipping {sample} because it is not in local unprocessed files" + ) + continue self.debug_log( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) - self.debug_log( - f"We compared the basename {os.path.splitext(os.path.basename(relevant_files[0]))[0]} to the processed images (truncated) {list(processed_images)[:5]}" - ) if len(relevant_files) == 0: continue From 7c2eb60c004488c71a24b3cd6fdc2150a1d13a51 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 10:15:14 -0700 Subject: [PATCH 14/24] VAECache: Log the details more precisely --- helpers/caching/vae.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 21b109e4..39ed2e88 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -257,14 +257,10 @@ def process_buckets(self, bucket_manager): and f in self.local_unprocessed_files ] for sample in aspect_bucket_cache[bucket]: - if sample not in relevant_files: + quick_piece = os.path.splitext(os.path.basename(sample))[0] + if quick_piece in processed_images: self.debug_log( - f"Skipping {sample} because it is not in relevant files" - ) - continue - if os.path.splitext(os.path.basename(sample))[0] in processed_images: - self.debug_log( - f"Skipping {sample} because it is in processed images" + f"Skipping {quick_piece} because it is in processed images" ) continue if sample not in self.local_unprocessed_files: @@ -272,6 +268,9 @@ def process_buckets(self, bucket_manager): f"Skipping {sample} because it is not in local unprocessed files" ) continue + self.debug_log( + f"Processing {sample} (quick_piece {quick_piece}) because it is in local unprocessed files" + ) self.debug_log( f"Reduced bucket {bucket} down from 
{len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) From ca721ac4db015ce0ef25d27c8d309512d0e9fb43 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 10:32:40 -0700 Subject: [PATCH 15/24] VAECache: Log more details, and a subset of the data --- helpers/caching/vae.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 39ed2e88..dcc5765a 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -114,9 +114,12 @@ def _list_cached_images(self): """ # Extract array of tuple into just, an array of files: pt_files = StateTracker.get_vae_cache_files() - logging.debug(f"Found {len(pt_files)} cached files in {self.cache_dir}") # Extract just the base filename without the extension - return {os.path.splitext(f)[0] for f in pt_files} + results = {os.path.splitext(f)[0] for f in pt_files} + logging.debug( + f"Found {len(pt_files)} cached files in {self.cache_dir} (truncated): {results[:5]}" + ) + return results def encode_image(self, image, filepath): """ From 4163d714bbd356ac7d0b5dac1011d794368db396 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 10:55:10 -0700 Subject: [PATCH 16/24] VAECache: Log more details, and a subset of the data --- helpers/caching/vae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index dcc5765a..b734f0c8 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -117,7 +117,7 @@ def _list_cached_images(self): # Extract just the base filename without the extension results = {os.path.splitext(f)[0] for f in pt_files} logging.debug( - f"Found {len(pt_files)} cached files in {self.cache_dir} (truncated): {results[:5]}" + f"Found {len(pt_files)} cached files in {self.cache_dir} (truncated): {list(results)[:5]}" ) return results From 1e3bedd4068d4400914df193146ef8cafaa88315 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 12:13:23 -0700 Subject: [PATCH 17/24] VAECache: Log more details --- helpers/caching/vae.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index b734f0c8..044484b7 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -272,7 +272,7 @@ def process_buckets(self, bucket_manager): ) continue self.debug_log( - f"Processing {sample} (quick_piece {quick_piece}) because it is in local unprocessed files" + f"Processing bucket {bucket} sample {sample} (quick_piece {quick_piece}) because it is in local unprocessed files" ) self.debug_log( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" From 368ad8d77e66c0a05208a4bbf2cb451416f2d964 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 12:45:07 -0700 Subject: [PATCH 18/24] VAECache: Make aspect shuffle opt-out --- helpers/caching/vae.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 044484b7..2519a2d0 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -249,8 +249,10 @@ def process_buckets(self, bucket_manager): aspect_bucket_cache = bucket_manager.read_cache().copy() # Extract and shuffle the keys of the dictionary - shuffled_keys = list(aspect_bucket_cache.keys()) - shuffle(shuffled_keys) + do_shuffle = os.environ.get('SIMPLETUNER_SHUFFLE_ASPECTS', 'true').lower() == 'true' + if do_shuffle: + shuffled_keys = list(aspect_bucket_cache.keys()) + 
shuffle(shuffled_keys) for bucket in shuffled_keys: relevant_files = [ From 20a3f9fcd2b9194732a2a8c85bb1f6999dbce426 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 13:12:11 -0700 Subject: [PATCH 19/24] VAECache: More logging --- helpers/caching/vae.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 2519a2d0..7a37539d 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -105,7 +105,15 @@ def discover_unprocessed_files(self, directory: str = None): for file in cache_filenames if file not in existing_cache_files } - + for file in cache_filenames: + if file not in existing_cache_files: + self.debug_log( + f"discover_unprocessed_files: {file} is not in existing_cache_files" + ) + else: + self.debug_log( + f"discover_unprocessed_files: {file} is in existing_cache_files" + ) return list(unprocessed_files) def _list_cached_images(self): @@ -264,9 +272,6 @@ def process_buckets(self, bucket_manager): for sample in aspect_bucket_cache[bucket]: quick_piece = os.path.splitext(os.path.basename(sample))[0] if quick_piece in processed_images: - self.debug_log( - f"Skipping {quick_piece} because it is in processed images" - ) continue if sample not in self.local_unprocessed_files: self.debug_log( From 1456fae860d871b4b1db4ee732386e8002521520 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 13:29:36 -0700 Subject: [PATCH 20/24] VAECache: Reduce logging --- helpers/caching/vae.py | 27 ++++++--------------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 7a37539d..5088c1c5 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -105,15 +105,6 @@ def discover_unprocessed_files(self, directory: str = None): for file in cache_filenames if file not in existing_cache_files } - for file in cache_filenames: - if file not in existing_cache_files: - self.debug_log( - f"discover_unprocessed_files: {file} is not in existing_cache_files" - ) - else: - self.debug_log( - f"discover_unprocessed_files: {file} is in existing_cache_files" - ) return list(unprocessed_files) def _list_cached_images(self): @@ -269,18 +260,6 @@ def process_buckets(self, bucket_manager): if os.path.splitext(os.path.basename(f))[0] not in processed_images and f in self.local_unprocessed_files ] - for sample in aspect_bucket_cache[bucket]: - quick_piece = os.path.splitext(os.path.basename(sample))[0] - if quick_piece in processed_images: - continue - if sample not in self.local_unprocessed_files: - self.debug_log( - f"Skipping {sample} because it is not in local unprocessed files" - ) - continue - self.debug_log( - f"Processing bucket {bucket} sample {sample} (quick_piece {quick_piece}) because it is in local unprocessed files" - ) self.debug_log( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) @@ -332,6 +311,9 @@ def process_buckets(self, bucket_manager): ) vae_input_images.append(pixel_values) vae_input_filepaths.append(filepath) + self.debug_log( + f"Completed processing {filepath}" + ) except ValueError as e: logger.error(f"Received fatal error: {e}") raise e @@ -374,6 +356,9 @@ def process_buckets(self, bucket_manager): # Handle remainders after processing the bucket if vae_input_images: # If there are images left to be encoded + self.debug_log( + f"Processing the remainder, {len(vae_input_images)} images" + ) latents_batch = self.encode_images( vae_input_images, 
vae_input_filepaths, load_from_cache=False ) From a9a68b4d19cf5786ad2b36e704a8bdf7d073d43f Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 16:38:38 -0700 Subject: [PATCH 21/24] VAECache: Do not keep prompt embeds output value around after we no longer need it --- helpers/caching/sdxl_embeds.py | 1 + 1 file changed, 1 insertion(+) diff --git a/helpers/caching/sdxl_embeds.py b/helpers/caching/sdxl_embeds.py index a6140fb1..376aea89 100644 --- a/helpers/caching/sdxl_embeds.py +++ b/helpers/caching/sdxl_embeds.py @@ -95,6 +95,7 @@ def encode_sdxl_prompt( # We are always interested in the pooled output of the final text encoder pooled_prompt_embeds = prompt_embeds_output[0] prompt_embeds = prompt_embeds_output.hidden_states[-2] + del prompt_embeds_output bs_embed, seq_len, _ = prompt_embeds.shape prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1) From b8f447bd9b59b1c847c5d6950c39c4ff8185b1f7 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 21:26:08 -0700 Subject: [PATCH 22/24] VAECache: Shuffle aspect bucket contents by default when processing --- helpers/caching/vae.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/helpers/caching/vae.py b/helpers/caching/vae.py index 5088c1c5..ac0d5223 100644 --- a/helpers/caching/vae.py +++ b/helpers/caching/vae.py @@ -260,6 +260,8 @@ def process_buckets(self, bucket_manager): if os.path.splitext(os.path.basename(f))[0] not in processed_images and f in self.local_unprocessed_files ] + if do_shuffle: + shuffle(relevant_files) self.debug_log( f"Reduced bucket {bucket} down from {len(aspect_bucket_cache[bucket])} to {len(relevant_files)} relevant files" ) From 4d764264d212df95916994f96076bc07c29163d4 Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 21:26:40 -0700 Subject: [PATCH 23/24] S3DataBackend: Increase data rate for list files --- helpers/data_backend/aws.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helpers/data_backend/aws.py b/helpers/data_backend/aws.py index 78db47d3..83a176dc 100644 --- a/helpers/data_backend/aws.py +++ b/helpers/data_backend/aws.py @@ -183,7 +183,7 @@ def list_files(self, str_pattern: str, instance_data_root: str = None): ) # Paginating over the entire bucket objects - for page in paginator.paginate(Bucket=self.bucket_name): + for page in paginator.paginate(Bucket=self.bucket_name, MaxKeys=10000): for obj in page.get("Contents", []): # Filter based on the provided pattern if fnmatch.fnmatch(obj["Key"], pattern): From 09b0426a484027b4bb70c7cc00b4166a89c0329a Mon Sep 17 00:00:00 2001 From: bghira Date: Sat, 14 Oct 2023 21:27:11 -0700 Subject: [PATCH 24/24] Documentation updates, memory use optimisations --- documentation/DEEPSPEED.md | 2 +- helpers/caching/sdxl_embeds.py | 1 + train_sdxl.py | 16 +++++++++------- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/documentation/DEEPSPEED.md b/documentation/DEEPSPEED.md index b4058f7b..a1dbd899 100644 --- a/documentation/DEEPSPEED.md +++ b/documentation/DEEPSPEED.md @@ -13,7 +13,7 @@ SimpleTuner v0.7 includes preliminary support for training SDXL using DeepSpeed | | | MIG M. | |===============================+======================+======================| | 0 NVIDIA GeForce ... 
Off  | 00000000:08:00.0 Off |                  Off |
-| 0%   43C    P2   100W / 450W |   9237MiB / 24564MiB |      0%      Default |
+| 0%   43C    P2   100W / 450W |   9237MiB / 24564MiB |    100%      Default |
 |                               |                      |                  N/A |
 +-------------------------------+----------------------+----------------------+

diff --git a/helpers/caching/sdxl_embeds.py b/helpers/caching/sdxl_embeds.py
index 376aea89..91b4fd44 100644
--- a/helpers/caching/sdxl_embeds.py
+++ b/helpers/caching/sdxl_embeds.py
@@ -100,6 +100,7 @@ def encode_sdxl_prompt(
         prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)

         # Clear out anything we moved to the text encoder device
+        text_input_ids = text_input_ids.to('cpu')
         del text_input_ids

         prompt_embeds_list.append(prompt_embeds)
diff --git a/train_sdxl.py b/train_sdxl.py
index 837d01f2..8a0f7547 100644
--- a/train_sdxl.py
+++ b/train_sdxl.py
@@ -817,13 +817,15 @@ def main():
     accelerator.register_load_state_pre_hook(model_hooks.load_model_hook)

     # Prepare everything with our `accelerator`.
-    logger.info(f"Loading our accelerator...")
-    unet, train_dataloader, lr_scheduler, optimizer = accelerator.prepare(
-        unet, train_dataloader, lr_scheduler, optimizer
-    )
-    if args.use_ema:
-        logger.info("Moving EMA model weights to accelerator...")
-        ema_unet.to(accelerator.device, dtype=weight_dtype)
+    disable_accelerator = os.environ.get('SIMPLETUNER_DISABLE_ACCELERATOR', 'false').lower() == 'true'
+    if not disable_accelerator:
+        logger.info(f"Loading our accelerator...")
+        unet, train_dataloader, lr_scheduler, optimizer = accelerator.prepare(
+            unet, train_dataloader, lr_scheduler, optimizer
+        )
+        if args.use_ema:
+            logger.info("Moving EMA model weights to accelerator...")
+            ema_unet.to(accelerator.device, dtype=weight_dtype)

     # Move vae, unet and text_encoder to device and cast to weight_dtype
     # The VAE is in float32 to avoid NaN losses.
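
A note on the two environment toggles introduced in this series (`SIMPLETUNER_SHUFFLE_ASPECTS` from patch 18 and `SIMPLETUNER_DISABLE_ACCELERATOR` from patch 24): `os.environ` values are always strings, so a bare `os.environ.get(name, False)` treats any non-empty value — including the literal string `"false"` — as truthy. The sketch below shows one defensive way to parse such flags; the `env_flag` helper is illustrative and does not exist in the tree.

```python
import os

def env_flag(name: str, default: bool) -> bool:
    """Parse an environment variable as a boolean flag.

    Treats "1", "true", "yes", and "on" (case-insensitively) as True and
    anything else as False; falls back to `default` when the variable is unset.
    """
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")

# Example usage with the toggles these patches introduce:
do_shuffle = env_flag("SIMPLETUNER_SHUFFLE_ASPECTS", True)
disable_accelerator = env_flag("SIMPLETUNER_DISABLE_ACCELERATOR", False)
```

Separately, regarding patch 23: the S3 ListObjects APIs cap `MaxKeys` at 1,000 per page, so requesting 10,000 still returns at most 1,000 keys per page. A real listing speed-up would need fewer or concurrent round trips (for example, parallel listings split by key prefix) rather than a larger page size.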