"""Gradio demo: image-to-video generation with Wan 2.1 plus the Remade "Squish" LoRA."""

import os

# Install diffusers (from source) along with transformers and accelerate if it
# is missing. This check must run before the diffusers imports below, or a
# missing install would raise ImportError before it could be fixed.
try:
    import diffusers  # noqa: F401
    print("diffusers is already installed.")
except ImportError:
    print("Installing diffusers...")
    os.system("pip install git+https://github.com/huggingface/diffusers.git transformers accelerate")

import gradio as gr
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
from transformers import CLIPVisionModel

# Base model and LoRA adapter (Hugging Face Hub ids).
model_id = "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers"
lora_weights = "Remade/Squish"
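# Note: style LoRAs such as this one typically need a trigger phrase in the
# prompt to take effect; see the Remade/Squish model card for the exact wording.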


def load_models():
    """Load the Wan image-to-video pipeline; return None so the UI can report failure."""
    try:
        # Keep the image encoder and VAE in float32 for numerical stability;
        # the transformer runs in bfloat16 to save memory.
        image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
        vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
        pipe = WanImageToVideoPipeline.from_pretrained(model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.bfloat16)
        pipe.load_lora_weights(lora_weights)
        # enable_model_cpu_offload() manages device placement itself, so no
        # explicit pipe.to("cuda") is needed (it would defeat the offloading).
        pipe.enable_model_cpu_offload()
        return pipe
    except Exception as e:
        print(f"Error loading models: {e}")
        return None


# Load the pipeline once at startup; generate_video checks for None per request.
pipe = load_models()


def generate_video(image_path, prompt, num_frames, guidance_scale, num_inference_steps, progress=gr.Progress()):
    if pipe is None:
        return "Error: Model failed to load. Check server logs for details.", None

    if not image_path or not prompt:
        return "Error: Please provide both an image and a prompt.", None

    try:
        # load_image accepts a local file path (what gr.Image provides) or a URL.
        image = load_image(image_path)

        # Fit the image inside the model's 480p training budget while keeping
        # its aspect ratio, then snap both sides down to a multiple of the
        # VAE's spatial stride times the transformer patch size.
        max_area = 480 * 832
        aspect_ratio = image.height / image.width
        mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
        height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
        width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
        image = image.resize((width, height))
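
        # Worked example (assuming a 1280x720 input and mod_value = 16):
        #   aspect_ratio = 720 / 1280 = 0.5625, max_area = 399360
        #   height = round(sqrt(399360 * 0.5625)) // 16 * 16 = 474 // 16 * 16 = 464
        #   width  = round(sqrt(399360 / 0.5625)) // 16 * 16 = 843 // 16 * 16 = 832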

        # Run the diffusion pipeline; .frames[0] holds the frames of the
        # first (and only) video in the batch.
        output = pipe(
            image=image,
            prompt=prompt,
            height=height,
            width=width,
            num_frames=int(num_frames),
            guidance_scale=guidance_scale,
            num_inference_steps=int(num_inference_steps),
        ).frames[0]

        # 16 fps matches Wan 2.1's native frame rate.
        export_to_video(output, "output.mp4", fps=16)
        return "Video generated successfully.", "output.mp4"

    except Exception as e:
        return f"An error occurred: {e}", None


iface = gr.Interface(
    fn=generate_video,
    inputs=[
        gr.Image(type="filepath", label="Input Image"),
        gr.Textbox(label="Prompt"),
        # Wan generates 4k + 1 frames (the pipeline rounds other counts down),
        # so the slider steps through valid values only.
        gr.Slider(minimum=17, maximum=81, step=4, value=81, label="Number of Frames"),
        gr.Slider(minimum=1, maximum=10, step=0.1, value=5.0, label="Guidance Scale"),
        gr.Slider(minimum=10, maximum=50, step=1, value=28, label="Inference Steps"),
    ],
    outputs=[
        gr.Textbox(label="Status/Error Message"),
        gr.Video(label="Generated Video"),
    ],
    title="Wan Image-to-Video Generator",
    description="Generate videos from an image and a text prompt using the Wan Image-to-Video model.",
)


if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)
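# Note: server_name="0.0.0.0" exposes the app on all interfaces, and 7860 is
# the default Gradio port that Hugging Face Spaces expects.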