Added method comments #287

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion README.md
@@ -13,7 +13,7 @@ new checkpoints. The following list provides an overview of all currently availa

*Stable UnCLIP 2.1*

- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
- New stable diffusion fine-tune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).

- A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)
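
For context on the README change above: a minimal sketch of the image-variation use the paragraph describes, via the diffusers StableUnCLIPImg2ImgPipeline. The pipeline class and repo id follow the Hugging Face diffusers documentation, not this PR, so treat this as assumed usage rather than this repo's API.

import torch
from diffusers import StableUnCLIPImg2ImgPipeline
from PIL import Image

# Load the Stable unCLIP 2.1 weights via diffusers (assumed repo id).
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
).to("cuda")

# The pipeline embeds the input image with CLIP and samples variations.
init_image = Image.open("input.png").convert("RGB")
variations = pipe(init_image).images
variations[0].save("variation.png")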

2 changes: 2 additions & 0 deletions ldm/util.py
@@ -9,6 +9,7 @@


def autocast(f):
""" Decorator for autocasting inside a function """
def do_autocast(*args, **kwargs):
with torch.cuda.amp.autocast(enabled=True,
dtype=torch.get_autocast_gpu_dtype(),
@@ -19,6 +20,7 @@ def do_autocast(*args, **kwargs):


def log_txt_as_img(wh, xc, size=10):
""" Convert a list of strings to a list of images """
# wh a tuple of (width, height)
# xc a list of captions to plot
b = len(xc)
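
Since the new docstrings are terse, here is a self-contained sketch of the decorator pattern the autocast hunk documents. Standalone example, not part of the diff; the original also forwards a cache_enabled flag, elided here.

import functools
import torch

def autocast(f):
    """ Run the decorated function under CUDA autocast. """
    @functools.wraps(f)
    def do_autocast(*args, **kwargs):
        with torch.cuda.amp.autocast(enabled=True,
                                     dtype=torch.get_autocast_gpu_dtype()):
            return f(*args, **kwargs)
    return do_autocast

@autocast
def project(x, w):
    return x @ w  # matmul runs in float16/bfloat16 under autocast on CUDA
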
5 changes: 5 additions & 0 deletions scripts/gradio/depth2img.py
@@ -17,6 +17,7 @@


def initialize_model(config, ckpt):
""" Initialize model from config and checkpoint. """
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
@@ -35,6 +36,7 @@ def make_batch_sd(
num_samples=1,
model_type="dpt_hybrid"
):
""" Make batch for sampling from image and text. """
image = np.array(image.convert("RGB"))
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
# sample['jpg'] is tensor hwc in [-1, 1] at this point
@@ -54,6 +56,7 @@

def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
do_full_sample=False):
""" Paint image from text prompt. """
device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
model = sampler.model
@@ -113,6 +116,7 @@ def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=No


def pad_image(input_image):
""" Pad image to integer multiple of 32. """
pad_w, pad_h = np.max(((2, 2), np.ceil(
np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
im_padded = Image.fromarray(
@@ -121,6 +125,7 @@


def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
""" Predict image from text prompt. """
init_image = input_image.convert("RGB")
image = pad_image(init_image) # resize to integer multiple of 64

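
A worked example of the padding rule in pad_image above, which motivates the docstring and comment fixes from 32 to 64: sizes are rounded up to a multiple of 64, with a floor of 128 coming from the (2, 2) term.

import numpy as np

size = np.array((500, 375))  # (width, height) of the input image
blocks = np.max(((2, 2), np.ceil(size / 64).astype(int)), axis=0)
pad_w, pad_h = blocks * 64 - size
# blocks -> [8, 6]; padded size -> 512 x 384; pad_w, pad_h -> 12, 9
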
5 changes: 5 additions & 0 deletions scripts/gradio/inpainting.py
@@ -17,6 +17,7 @@


def put_watermark(img, wm_encoder=None):
""" Put watermark on image. """
if wm_encoder is not None:
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img = wm_encoder.encode(img, 'dwtDct')
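
For reviewers unfamiliar with the wm_encoder argument: a sketch of how an encoder is typically constructed with the invisible-watermark package before being passed in. Assumed usage mirroring other scripts in this repo, not part of the diff.

from imwatermark import WatermarkEncoder

wm_encoder = WatermarkEncoder()
wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))
# img = put_watermark(img, wm_encoder)  # embeds the mark via 'dwtDct'
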
@@ -25,6 +26,7 @@ def put_watermark(img, wm_encoder=None):


def initialize_model(config, ckpt):
""" Initialize model from config and checkpoint. """
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)

@@ -44,6 +46,7 @@ def make_batch_sd(
txt,
device,
num_samples=1):
""" Make batch for sampling from image and text. """
image = np.array(image.convert("RGB"))
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
@@ -67,6 +70,7 @@


def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
""" Inpaint image with prompt. """
device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
model = sampler.model
@@ -135,6 +139,7 @@ def pad_image(input_image):
return im_padded

def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
""" Predict with prompt. """
init_image = input_image["image"].convert("RGB")
init_mask = input_image["mask"].convert("RGB")
image = pad_image(init_image) # resize to integer multiple of 64
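
The inpainting make_batch_sd is only partially visible in this diff. A hedged sketch of the full batch it assembles; key names follow this file, while the exact mask thresholding is an assumption.

import numpy as np
import torch

def make_batch_sketch(image_pil, mask_pil, txt, device, num_samples=1):
    # Image to NCHW float tensor in [-1, 1], as in the visible hunk.
    image = np.array(image_pil.convert("RGB"))[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image).to(torch.float32) / 127.5 - 1.0

    # Binary mask in {0, 1}; 1 marks pixels to repaint.
    mask = np.array(mask_pil.convert("L"))[None, None]
    mask = (torch.from_numpy(mask).to(torch.float32) / 255.0 >= 0.5).float()

    # Condition on the image with the masked region blanked out.
    masked_image = image * (1.0 - mask)

    rep = lambda t: t.to(device).repeat(num_samples, 1, 1, 1)
    return {
        "image": rep(image),
        "txt": num_samples * [txt],
        "mask": rep(mask),
        "masked_image": rep(masked_image),
    }
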
6 changes: 6 additions & 0 deletions scripts/gradio/superresolution.py
@@ -18,6 +18,7 @@


def initialize_model(config, ckpt):
""" Initialize model from config and checkpoint. """
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
@@ -35,6 +36,7 @@ def make_batch_sd(
device,
num_samples=1,
):
""" Make batch for sampling from image and text. """
image = np.array(image.convert("RGB"))
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
batch = {
@@ -47,13 +49,15 @@


def make_noise_augmentation(model, batch, noise_level=None):
""" Make noise augmentation for low scale model. """
x_low = batch[model.low_scale_key]
x_low = x_low.to(memory_format=torch.contiguous_format).float()
x_aug, noise_level = model.low_scale_model(x_low, noise_level)
return x_aug, noise_level


def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
""" Paint image from text prompt. """
device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
model = sampler.model
@@ -120,6 +124,7 @@ def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callb


def pad_image(input_image):
""" Pad image to integer multiple of 32. """
pad_w, pad_h = np.max(((2, 2), np.ceil(
np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
im_padded = Image.fromarray(
@@ -128,6 +133,7 @@


def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
""" Predict image from text prompt. """
init_image = input_image.convert("RGB")
image = pad_image(init_image) # resize to integer multiple of 64
width, height = image.size
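
Finally, for context on make_noise_augmentation above: the low-scale model noises the low-resolution conditioning image at a sampled level and returns that level so the network can be conditioned on it. A hedged sketch of the idea, using the standard forward-diffusion q_sample; the actual low_scale_model internals may differ.

import torch

def noise_augment(x_low, alphas_cumprod, noise_level=None, max_level=350):
    # Pick a per-sample noise level if none is given.
    b = x_low.shape[0]
    if noise_level is None:
        noise_level = torch.randint(0, max_level, (b,), device=x_low.device)
    # Standard forward-diffusion noising at that level.
    a = alphas_cumprod[noise_level].view(b, 1, 1, 1)
    x_aug = a.sqrt() * x_low + (1.0 - a).sqrt() * torch.randn_like(x_low)
    return x_aug, noise_level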