# wan-api / app.py
import gradio as gr
import torch
import numpy as np
import torchvision.transforms.functional as TF
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video
from transformers import CLIPVisionModel
from PIL import Image
import tempfile
# --- Load Model ---
model_id = "Wan-AI/Wan2.1-FLF2V-14B-720P-Diffusers"
image_encoder = CLIPVisionModel.from_pretrained(model_id, subfolder="image_encoder", torch_dtype=torch.float32)
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanImageToVideoPipeline.from_pretrained(
    model_id, vae=vae, image_encoder=image_encoder, torch_dtype=torch.float16
)
# Model CPU offload manages device placement on its own, so the pipeline is not
# also moved to CUDA first; fall back to plain CPU execution when no GPU is present.
if torch.cuda.is_available():
    pipe.enable_model_cpu_offload()
else:
    pipe.to("cpu")
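# Optional alternative for very low-VRAM machines, offered as a hedged sketch rather
# than something this app originally used: diffusers pipelines also expose sequential
# CPU offload, which trades generation speed for a smaller memory footprint:
#   pipe.enable_sequential_cpu_offload()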
# --- Helper Functions ---
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
    # Scale the image to roughly max_area pixels while keeping its aspect ratio,
    # then snap both sides down to the model's spatial granularity.
    aspect_ratio = image.height / image.width
    # patch_size is (temporal, height, width); index 1 gives the spatial patch size.
    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
    height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
    width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
    image = image.resize((width, height))
    return image, height, width
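# Worked example (illustrative only; the real mod_value is read from the loaded pipe).
# Assuming vae_scale_factor_spatial == 8 and patch_size[1] == 2, mod_value == 16.
# For a 1080x1920 portrait input: aspect_ratio = 1920 / 1080 ≈ 1.778, so
#   height = round(sqrt(921600 * 1.778)) // 16 * 16 = 1280
#   width  = round(sqrt(921600 / 1.778)) // 16 * 16 = 720
# i.e. the frame is resized to 720x1280 before being fed to the pipeline.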
def center_crop_resize(image, height, width):
    # Compute the target size from the larger of the two scale ratios, then
    # center-crop (center_crop pads with zeros when the requested size exceeds
    # the image), so the last frame ends up matching the first frame's shape.
    resize_ratio = max(width / image.width, height / image.height)
    width = round(image.width * resize_ratio)
    height = round(image.height * resize_ratio)
    size = [width, height]
    image = TF.center_crop(image, size)
    return image, height, width
# --- Gradio Inference Function ---
def infer(first_image, last_image, prompt, guidance=5.5, frames=25):
    # Convert numpy arrays from Gradio to PIL images
    if not isinstance(first_image, Image.Image):
        first_image = Image.fromarray(first_image)
    if not isinstance(last_image, Image.Image):
        last_image = Image.fromarray(last_image)
    # Resize the first frame to the model's working resolution and make the
    # last frame match it if the sizes differ
    first_image, height, width = aspect_ratio_resize(first_image, pipe)
    if last_image.size != first_image.size:
        last_image, _, _ = center_crop_resize(last_image, height, width)
    # Run the first/last-frame-to-video pipeline; the start frame is passed as
    # `image` and the end frame as `last_image`
    output = pipe(
        image=first_image,
        last_image=last_image,
        prompt=prompt,
        height=height,
        width=width,
        guidance_scale=guidance,
        num_frames=frames,
    ).frames[0]
    # Export the generated clip to an MP4 that Gradio can serve
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        export_to_video(output, tmp.name, fps=16)
    return tmp.name
# --- Gradio Interface ---
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Image(type="pil", label="Start Frame"),
        gr.Image(type="pil", label="End Frame"),
        gr.Textbox(placeholder="Prompt (optional)", label="Prompt"),
        gr.Slider(3, 12, value=5.5, step=0.1, label="Guidance Scale"),
        gr.Slider(8, 48, value=25, step=1, label="Num Frames"),
    ],
    outputs=gr.Video(label="Generated Video"),
    title="WAN Two-Frame Video Interpolation",
    description="Upload two images and (optionally) a prompt to create a smooth video transition.",
    delete_cache=(60, 60),  # every 60 s, delete cached outputs older than 60 s
)
if __name__ == "__main__":
    demo.launch(show_api=True)
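# Example client call, included as a hedged sketch: it assumes the app is running
# locally on Gradio's default URL, that gradio_client is installed, and that the
# files "start.png" / "end.png" exist; "/predict" is gr.Interface's default route.
#
#   from gradio_client import Client, handle_file
#
#   client = Client("http://127.0.0.1:7860")
#   video_path = client.predict(
#       handle_file("start.png"),                 # first_image
#       handle_file("end.png"),                   # last_image
#       "a smooth morph between the two scenes",  # prompt
#       5.5,                                      # guidance
#       25,                                       # frames
#       api_name="/predict",
#   )
#   print(video_path)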