Added method comments #287

Open · wants to merge 1 commit into base: main
2 changes: 1 addition & 1 deletion README.md
@@ -13,7 +13,7 @@ new checkpoints. The following list provides an overview of all currently availa

*Stable UnCLIP 2.1*

- New stable diffusion finetune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).
- New stable diffusion fine-tune (_Stable unCLIP 2.1_, [Hugging Face](https://huggingface.co/stabilityai/)) at 768x768 resolution, based on SD2.1-768. This model allows for image variations and mixing operations as described in [*Hierarchical Text-Conditional Image Generation with CLIP Latents*](https://arxiv.org/abs/2204.06125), and, thanks to its modularity, can be combined with other models such as [KARLO](https://github.com/kakaobrain/karlo). Comes in two variants: [*Stable unCLIP-L*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-l.ckpt) and [*Stable unCLIP-H*](https://huggingface.co/stabilityai/stable-diffusion-2-1-unclip/blob/main/sd21-unclip-h.ckpt), which are conditioned on CLIP ViT-L and ViT-H image embeddings, respectively. Instructions are available [here](doc/UNCLIP.MD).

- A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)
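
For context on the README change above: a minimal sketch of the image-variation use the paragraph describes, via the diffusers StableUnCLIPImg2ImgPipeline. The pipeline class and repo id follow the Hugging Face diffusers documentation, not this PR, so treat this as assumed usage rather than this repo's API.

import torch
from diffusers import StableUnCLIPImg2ImgPipeline
from PIL import Image

# Load the Stable unCLIP 2.1 weights via diffusers (assumed repo id).
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-unclip", torch_dtype=torch.float16
).to("cuda")

# The pipeline embeds the input image with CLIP and samples variations.
init_image = Image.open("input.png").convert("RGB")
variations = pipe(init_image).images
variations[0].save("variation.png")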

2 changes: 2 additions & 0 deletions ldm/util.py
@@ -9,6 +9,7 @@


def autocast(f):
""" Decorator for autocasting inside a function """
def do_autocast(*args, **kwargs):
with torch.cuda.amp.autocast(enabled=True,
dtype=torch.get_autocast_gpu_dtype(),
@@ -19,6 +20,7 @@ def do_autocast(*args, **kwargs):


def log_txt_as_img(wh, xc, size=10):
""" Convert a list of strings to a list of images """
# wh a tuple of (width, height)
# xc a list of captions to plot
b = len(xc)
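
Since the new docstrings are terse, here is a self-contained sketch of the decorator pattern the autocast hunk documents. Standalone example, not part of the diff; the original also forwards a cache_enabled flag, elided here.

import functools
import torch

def autocast(f):
    """ Run the decorated function under CUDA autocast. """
    @functools.wraps(f)
    def do_autocast(*args, **kwargs):
        with torch.cuda.amp.autocast(enabled=True,
                                     dtype=torch.get_autocast_gpu_dtype()):
            return f(*args, **kwargs)
    return do_autocast

@autocast
def project(x, w):
    return x @ w  # matmul runs in float16/bfloat16 under autocast on CUDA
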
5 changes: 5 additions & 0 deletions scripts/gradio/depth2img.py
@@ -17,6 +17,7 @@


def initialize_model(config, ckpt):
""" Initialize model from config and checkpoint. """
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
@@ -35,6 +36,7 @@ def make_batch_sd(
num_samples=1,
model_type="dpt_hybrid"
):
""" Make batch for sampling from image and text. """
image = np.array(image.convert("RGB"))
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
# sample['jpg'] is tensor hwc in [-1, 1] at this point
@@ -54,6 +56,7 @@

def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
do_full_sample=False):
""" Paint image from text prompt. """
device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
model = sampler.model
@@ -113,6 +116,7 @@ def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=No


def pad_image(input_image):
""" Pad image to integer multiple of 32. """
pad_w, pad_h = np.max(((2, 2), np.ceil(
np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
im_padded = Image.fromarray(
@@ -121,6 +125,7 @@


def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
""" Predict image from text prompt. """
init_image = input_image.convert("RGB")
image = pad_image(init_image) # resize to integer multiple of 64

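
A worked example of the padding rule in pad_image above, which motivates the docstring and comment fixes from 32 to 64: sizes are rounded up to a multiple of 64, with a floor of 128 coming from the (2, 2) term.

import numpy as np

size = np.array((500, 375))  # (width, height) of the input image
blocks = np.max(((2, 2), np.ceil(size / 64).astype(int)), axis=0)
pad_w, pad_h = blocks * 64 - size
# blocks -> [8, 6]; padded size -> 512 x 384; pad_w, pad_h -> 12, 9
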
5 changes: 5 additions & 0 deletions scripts/gradio/inpainting.py
@@ -17,6 +17,7 @@


def put_watermark(img, wm_encoder=None):
""" Put watermark on image. """
if wm_encoder is not None:
img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
img = wm_encoder.encode(img, 'dwtDct')
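
For reviewers unfamiliar with the wm_encoder argument: a sketch of how an encoder is typically constructed with the invisible-watermark package before being passed in. Assumed usage mirroring other scripts in this repo, not part of the diff.

from imwatermark import WatermarkEncoder

wm_encoder = WatermarkEncoder()
wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))
# img = put_watermark(img, wm_encoder)  # embeds the mark via 'dwtDct'
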
@@ -25,6 +26,7 @@ def put_watermark(img, wm_encoder=None):


def initialize_model(config, ckpt):
""" Initialize model from config and checkpoint. """
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)

@@ -44,6 +46,7 @@ def make_batch_sd(
txt,
device,
num_samples=1):
""" Make batch for sampling from image and text. """
image = np.array(image.convert("RGB"))
image = image[None].transpose(0, 3, 1, 2)
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
@@ -67,6 +70,7 @@


def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
""" Inpaint image with prompt. """
device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
model = sampler.model
@@ -135,6 +139,7 @@ def pad_image(input_image):
return im_padded

def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
""" Predict with prompt. """
init_image = input_image["image"].convert("RGB")
init_mask = input_image["mask"].convert("RGB")
image = pad_image(init_image) # resize to integer multiple of 64
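
The inpainting make_batch_sd is only partially visible in this diff. A hedged sketch of the full batch it assembles; key names follow this file, while the exact mask thresholding is an assumption.

import numpy as np
import torch

def make_batch_sketch(image_pil, mask_pil, txt, device, num_samples=1):
    # Image to NCHW float tensor in [-1, 1], as in the visible hunk.
    image = np.array(image_pil.convert("RGB"))[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image).to(torch.float32) / 127.5 - 1.0

    # Binary mask in {0, 1}; 1 marks pixels to repaint.
    mask = np.array(mask_pil.convert("L"))[None, None]
    mask = (torch.from_numpy(mask).to(torch.float32) / 255.0 >= 0.5).float()

    # Condition on the image with the masked region blanked out.
    masked_image = image * (1.0 - mask)

    rep = lambda t: t.to(device).repeat(num_samples, 1, 1, 1)
    return {
        "image": rep(image),
        "txt": num_samples * [txt],
        "mask": rep(mask),
        "masked_image": rep(masked_image),
    }
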
6 changes: 6 additions & 0 deletions scripts/gradio/superresolution.py
@@ -18,6 +18,7 @@


def initialize_model(config, ckpt):
""" Initialize model from config and checkpoint. """
config = OmegaConf.load(config)
model = instantiate_from_config(config.model)
model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
@@ -35,6 +36,7 @@ def make_batch_sd(
device,
num_samples=1,
):
""" Make batch for sampling from image and text. """
image = np.array(image.convert("RGB"))
image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
batch = {
@@ -47,13 +49,15 @@


def make_noise_augmentation(model, batch, noise_level=None):
""" Make noise augmentation for low scale model. """
x_low = batch[model.low_scale_key]
x_low = x_low.to(memory_format=torch.contiguous_format).float()
x_aug, noise_level = model.low_scale_model(x_low, noise_level)
return x_aug, noise_level


def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
""" Paint image from text prompt. """
device = torch.device(
"cuda") if torch.cuda.is_available() else torch.device("cpu")
model = sampler.model
@@ -120,6 +124,7 @@ def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callb


def pad_image(input_image):
""" Pad image to integer multiple of 32. """
pad_w, pad_h = np.max(((2, 2), np.ceil(
np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
im_padded = Image.fromarray(
@@ -128,6 +133,7 @@


def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
""" Predict image from text prompt. """
init_image = input_image.convert("RGB")
image = pad_image(init_image) # resize to integer multiple of 64
width, height = image.size
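
Finally, for context on make_noise_augmentation above: the low-scale model noises the low-resolution conditioning image at a sampled level and returns that level so the network can be conditioned on it. A hedged sketch of the idea, using the standard forward-diffusion q_sample; the actual low_scale_model internals may differ.

import torch

def noise_augment(x_low, alphas_cumprod, noise_level=None, max_level=350):
    # Pick a per-sample noise level if none is given.
    b = x_low.shape[0]
    if noise_level is None:
        noise_level = torch.randint(0, max_level, (b,), device=x_low.device)
    # Standard forward-diffusion noising at that level.
    a = alphas_cumprod[noise_level].view(b, 1, 1, 1)
    x_aug = a.sqrt() * x_low + (1.0 - a).sqrt() * torch.randn_like(x_low)
    return x_aug, noise_level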