add ability to zoom/scroll init images #189

Open · wants to merge 2 commits into main
152 changes: 146 additions & 6 deletions scripts/modelscope/process_modelscope.py
@@ -23,12 +23,122 @@
from modules import shared, sd_hijack, lowvram
from modules.shared import opts, devices, state
import os
import json

from skimage.morphology import binary_dilation, square

pipe = None

def setup_pipeline():
    return TextToVideoSynthesis(ph.models_path + '/ModelScope/t2v')


def dilate_mask(mask, dilation_amount):
    # Convert PIL image to NumPy array
    mask_array = np.array(mask)

    # Create a square structuring element of size dilation_amount
    selem = square(dilation_amount)

    # Dilate the area filled with 0's (the invalid region)
    dilated_mask_array = binary_dilation(mask_array == 0, selem)

    # Convert back to a PIL image, inverting again because
    # binary_dilation expands areas that are True (1)
    dilated_mask = Image.fromarray(np.where(dilated_mask_array, 0, 1).astype(np.uint8))

    return dilated_mask
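
# Illustrative example (not part of this PR's diff): dilate_mask(mask, 8)
# grows the invalid (0) region by roughly 4 pixels in every direction,
# giving the later inpainting some overlap with valid pixels at the seam.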

def and_masks(mask1, mask2):
    # Convert PIL images to NumPy arrays
    mask1_array = np.array(mask1)
    mask2_array = np.array(mask2)

    # Element-wise minimum acts as a logical AND for binary masks
    and_mask_array = np.minimum(mask1_array, mask2_array)

    # Convert the result back to a PIL image
    and_mask = Image.fromarray(and_mask_array.astype(np.uint8))

    return and_mask

def get_value_at_time(time_map, query_time, default_values={'x': 0, 'y': 0, 'z': 1}):
    if not time_map:
        return default_values

    keys = sorted(time_map.keys())

    # Verify that keys are numerical
    if not all(isinstance(k, (int, float)) for k in keys):
        raise ValueError("All keys must be numerical.")

    # Retrieve dimension keys from default_values
    dims = default_values.keys()

    # Case 1: query_time is at or before the first keyframe
    if query_time <= keys[0]:
        return {dim: time_map[keys[0]].get(dim, default_values[dim]) for dim in dims}

    # Case 2: query_time is at or after the last keyframe
    if query_time >= keys[-1]:
        return {dim: time_map[keys[-1]].get(dim, default_values[dim]) for dim in dims}

    # Case 3: query_time falls between two keyframes; linearly interpolate
    for i in range(len(keys) - 1):
        current_time = keys[i]
        next_time = keys[i + 1]
        if current_time <= query_time < next_time:
            interp_value = {dim: time_map[current_time].get(dim, default_values[dim])
                            + ((query_time - current_time) / (next_time - current_time))
                            * (time_map[next_time].get(dim, default_values[dim]) - time_map[current_time].get(dim, default_values[dim]))
                            for dim in dims}
            return interp_value
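
# Illustrative example (not part of this PR's diff): with the keyframes
#   {0: {'x': 0, 'y': 0, 'z': 2.0}, 12: {'x': 128, 'y': 128, 'z': 1.0}}
# get_value_at_time(time_map, 6) lands halfway between the keyframes and
# returns {'x': 64.0, 'y': 64.0, 'z': 1.5}.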

def transform_image(image, x, y, z, width, height, base_mask=None):
    # Create a new blank image
    new_image = Image.new('RGB', (width, height))

    # Define a mask that marks valid (1) and invalid (0) pixels
    mask = Image.new('L', (width, height), 0)

    # Cut a rectangle from the input image; z scales the crop size
    left = max(x, 0)
    upper = max(y, 0)
    right = min(x + width * z, image.width)
    lower = min(y + height * z, image.height)

    # Adjust width and height for the mask according to the valid cropped area
    valid_width = int((right - left) / z)
    valid_height = int((lower - upper) / z)

    # Determine paste coordinates in the new image, considering the 'z' factor
    paste_x = max(0, int(-x / z))
    paste_y = max(0, int(-y / z))

    # Ensure that crop dimensions are positive
    if valid_width > 0 and valid_height > 0:
        cropped_image = image.crop((left, upper, right, lower))
        cropped_image = cropped_image.resize((valid_width, valid_height))

        # Paste the cropped image into the new blank image at the appropriate position
        new_image.paste(cropped_image, (paste_x, paste_y))

        # All pixels in the region covered by the cropped image are valid
        mask.paste(Image.new('L', (valid_width, valid_height), 1), (paste_x, paste_y))

        # AND the validity mask with the cropped base mask (element-wise minimum)
        if base_mask is not None:
            cropped_mask = base_mask.crop((left, upper, right, lower))
            cropped_mask = cropped_mask.resize((valid_width, valid_height))
            mask2 = Image.new('L', (width, height), 0)
            mask2.paste(cropped_mask, (paste_x, paste_y))
            mask = and_masks(mask, mask2)

    return new_image, mask
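
# Illustrative example (not part of this PR's diff): for a 512x512 input with
# width=height=512, x=y=0 and z=2.0, the crop rectangle (0, 0, 512, 512) is
# resized to 256x256 and pasted at (0, 0): the frame shows the whole image at
# half size in the top-left corner, and the mask marks everything else as
# invalid (0) so it will be inpainted.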

def process_modelscope(args_dict):
    args, video_args = process_args(args_dict)

@@ -155,15 +265,43 @@ def process_modelscope(args_dict):

        shared.state.job = f"Batch {batch + 1} out of {args.batch_count}"
        # TODO: move to a separate function
        #if args.inpainting_frames > 0 and hasattr(args.inpainting_image, "name"):
        if args.inpainting_frames > 0 and args.inpainting_image is not None:
            keys = T2VAnimKeys(SimpleNamespace(**{'max_frames': args.frames, 'inpainting_weights': args.inpainting_weights}), args.seed, args.inpainting_frames)
            images = []
            image_masks = []
            #print("Received an image for inpainting", args.inpainting_image.name)
            print("Received an image for inpainting", args.inpainting_image)

            try:
                zoom_sequence = json.loads(args.zoom_sequence)
                # JSON object keys are strings; convert frame indices to int
                zoom_sequence = {int(k): v for k, v in zoom_sequence.items()}
            except (json.JSONDecodeError, ValueError, TypeError):
                print("Error parsing zoom sequence", args.zoom_sequence)
                zoom_sequence = {}

            print("Zoom sequence", zoom_sequence)

            if args.inpainting_mask is not None:
                base_mask = Image.open(args.inpainting_mask).convert("L")
            else:
                base_mask = None

            for i in range(args.frames):
                #image = Image.open(args.inpainting_image.name).convert("RGB")
                image = Image.open(args.inpainting_image).convert("RGB")
                #image = image.resize((args.width, args.height), Image.ANTIALIAS)
                xyz = get_value_at_time(zoom_sequence, i)
                image, mask = transform_image(image, xyz['x'], xyz['y'], xyz['z'], args.width, args.height, base_mask)
                array = np.array(image)
                images += [array]
                # Dilate the invalid (zero) region so the inpainted seam overlaps valid pixels
                mask = dilate_mask(mask, 8)
                # Downsample the mask by 8x to match the latent resolution
                # (Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the equivalent filter)
                mask = mask.resize((args.width // 8, args.height // 8), Image.LANCZOS)
                image_masks += [mask]

            images = np.stack(images)  # f h w c
            batches = 1
Expand Down Expand Up @@ -195,7 +333,9 @@ def process_modelscope(args_dict):
            mask_weights = [keys.inpainting_weights_series[frame_idx] for frame_idx in range(args.frames)]

            for i in range(args.frames):
                # Force full inpainting (weight 1) wherever the transformed frame has no valid pixels
                v = mask_weights[i] + (1 - np.array(image_masks[i]))
                v = np.clip(v, 0, 1)
                mask[:, :, i, :, :] = v

            masked_latents = image_latents * (1 - mask) + latent_noise * mask
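The weight combination above can be sanity-checked in isolation. A minimal standalone sketch (not code from this PR; the 4x4 validity mask and the weight value are hypothetical) showing how invalid pixels force the inpainting weight to 1:

```python
import numpy as np

# Per-frame weight from the inpainting weights schedule (0 = keep, 1 = regenerate)
w = 0.25

# Hypothetical 4x4 validity mask at latent resolution:
# 1 = pixels covered by the transformed init image, 0 = area exposed by zoom/scroll
valid = np.array([[1, 1, 0, 0],
                  [1, 1, 0, 0],
                  [1, 1, 0, 0],
                  [1, 1, 0, 0]])

# Same combination as in process_modelscope: invalid pixels clip to weight 1
v = np.clip(w + (1 - valid), 0, 1)
print(v[0])  # [0.25 0.25 1.   1.  ]
```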
20 changes: 18 additions & 2 deletions scripts/t2v_helpers/args.py
@@ -65,7 +65,8 @@ def setup_text2video_settings_dictionary():
        # TODO: make it how it's done in Deforum/WebUI, so we won't have to track individual vars
        prompt, n_prompt, steps, seed, cfg_scale, width, height, eta, frames, batch_count = setup_common_values('txt2vid', d)
        with gr.Accordion('img2vid', open=False):
            inpainting_image = gr.Image(label="Inpainting image", interactive=True, type='filepath', elem_id="inpainting_chosen_file")
            inpainting_mask = gr.Image(label="Inpainting mask", interactive=True, type='filepath', elem_id="inpainting_mask_file")
            # TODO: should be tied to the total frame count dynamically
            inpainting_frames = gr.Slider(label='inpainting frames', value=d.inpainting_frames, minimum=0, maximum=24, step=1)
            with gr.Row():
@@ -80,6 +81,20 @@
Example: `0:(0), "max_i_f/4":(1), "3*max_i_f/4":(1), "max_i_f-1":(0)` ''')
            with gr.Row():
                inpainting_weights = gr.Textbox(label="Inpainting weights", value=d.inpainting_weights, interactive=True)


            with gr.Row():
                gr.Markdown('''`zoom sequence` describes the zoom and translation of the input image as a JSON dictionary mapping frame numbers to `x`/`y` pixel offsets and a zoom factor `z`.

For example:

zoom-in `{"0":{"x":0,"y":0,"z":2.0},"12":{"x":128,"y":128,"z":1.0}}`

zoom-out `{"0": {"x": 0, "y": 0, "z": 2.0}, "12": {"x": -256, "y": -256, "z": 4.0}}`

side-scroll `{"0": {"x": 0, "y": 0, "z": 2.0}, "12":{"x": 512, "y": 0, "z": 2.0}}`''')
            with gr.Row():
                zoom_sequence = gr.Textbox(label="Zoom sequence", value=d.zoom_sequence, interactive=True)

        def update_max_inp_frames(f, i_frames):  # Show video
            return gr.update(value=min(f, i_frames), maximum=f, visible=True)
@@ -133,7 +148,7 @@ def update_max_vid_frames(v2v_frames, sFrame): # Show video

v2v_values_names = str('''
do_vid2vid, vid2vid_frames, vid2vid_frames_path, strength,vid2vid_startFrame,
inpainting_image,inpainting_frames, inpainting_weights,
inpainting_image,inpainting_mask,inpainting_frames, inpainting_weights, zoom_sequence,
model_type''').replace("\n", "").replace("\r", "").replace(" ", "").split(',')

t2v_args_names = common_values_names + [f'{v}_v' for v in common_values_names] + v2v_values_names
@@ -179,6 +194,7 @@ def T2VArgs():
    strength = 0.75
    vid2vid_startFrame = 0
    inpainting_weights = '0:(t/max_i_f), "max_i_f":(1)'  # linear growth weights (as they used to be in the original variant)
    zoom_sequence = '{}'  # JSON dictionary with the zoom sequence, e.g. {"0":{"x":0,"y":0,"z":2.0},"12":{"x":128,"y":128,"z":1.0}}
    inpainting_frames = 0
    return locals()

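For reference, a standalone sketch (not part of this PR) of how the `Zoom sequence` textbox value is parsed on the processing side, using the side-scroll example from the help text above:

```python
import json

# Side-scroll example from the img2vid help text
zoom_sequence_text = '{"0": {"x": 0, "y": 0, "z": 2.0}, "12": {"x": 512, "y": 0, "z": 2.0}}'

try:
    zoom_sequence = json.loads(zoom_sequence_text)
    # JSON object keys are strings; frame indices must be ints for interpolation
    zoom_sequence = {int(k): v for k, v in zoom_sequence.items()}
except (json.JSONDecodeError, ValueError, TypeError):
    zoom_sequence = {}

print(zoom_sequence)
# {0: {'x': 0, 'y': 0, 'z': 2.0}, 12: {'x': 512, 'y': 0, 'z': 2.0}}
```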