import torch
import gradio as gr
from diffusers import (
    StableDiffusionPipeline,
    StableDiffusionInstructPix2PixPipeline,
    StableVideoDiffusionPipeline,
    WanPipeline,
)
from diffusers.utils import export_to_video

# Use half precision on GPU to save memory; fall back to float32 on CPU
dtype = torch.float16 if torch.cuda.is_available() else torch.float32
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------- Text to Image: Stable Diffusion --------
txt2img_pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1-base", torch_dtype=dtype
)
txt2img_pipe.to(device)


def generate_image_from_text(prompt):
    image = txt2img_pipe(prompt, num_inference_steps=30).images[0]
    return image


# -------- Image to Image: InstructPix2Pix --------
pix2pix_pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=dtype
)
pix2pix_pipe.to(device)


def generate_image_from_image_and_prompt(image, prompt):
    result = pix2pix_pipe(prompt=prompt, image=image, num_inference_steps=10)
    return result.images[0]


# -------- Text to Video: Wan T2V --------
# Wan 2.1 is distributed in bfloat16
wan_pipe = WanPipeline.from_pretrained(
    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
)
wan_pipe.to(device)


def generate_video_from_text(prompt):
    # Wan expects num_frames of the form 4k + 1
    frames = wan_pipe(prompt=prompt, num_frames=17).frames[0]
    video_path = export_to_video(frames, "wan_video.mp4", fps=8)
    return video_path


# -------- Image to Video: Stable Video Diffusion --------
svd_pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=dtype,
    variant="fp16" if dtype == torch.float16 else None,
)
svd_pipe.to(device)


def generate_video_from_image(image):
    # SVD was trained at 1024x576; resize the input to match
    image = image.resize((1024, 576))
    # decode_chunk_size limits how many frames the VAE decodes at once (saves VRAM)
    frames = svd_pipe(image, num_inference_steps=25, decode_chunk_size=8).frames[0]
    video_path = export_to_video(frames, "svd_video.mp4", fps=8)
    return video_path


# -------- Gradio Interface --------
with gr.Blocks() as demo:
    gr.Markdown("# 🧠 Multimodal Any-to-Any AI Playground")

    with gr.Tab("Text → Image"):
        prompt = gr.Textbox(label="Prompt")
        output_image = gr.Image()
        btn1 = gr.Button("Generate")
        btn1.click(fn=generate_image_from_text, inputs=prompt, outputs=output_image)

    with gr.Tab("Image → Image"):
        # type="pil" so the pipeline receives a PIL image rather than a numpy array
        in_image = gr.Image(label="Input Image", type="pil")
        edit_prompt = gr.Textbox(label="Edit Prompt")
        out_image = gr.Image()
        btn2 = gr.Button("Generate")
        btn2.click(
            fn=generate_image_from_image_and_prompt,
            inputs=[in_image, edit_prompt],
            outputs=out_image,
        )

    with gr.Tab("Text → Video"):
        vid_prompt = gr.Textbox(label="Prompt")
        output_vid = gr.Video()
        btn3 = gr.Button("Generate")
        btn3.click(fn=generate_video_from_text, inputs=vid_prompt, outputs=output_vid)

    with gr.Tab("Image → Video"):
        # type="pil" so image.resize() works inside generate_video_from_image
        img_input = gr.Image(label="Input Image", type="pil")
        vid_out = gr.Video()
        btn4 = gr.Button("Animate")
        btn4.click(fn=generate_video_from_image, inputs=img_input, outputs=vid_out)

demo.launch()
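# Optional note: moving all four pipelines onto a single GPU at once is likely
# to exhaust VRAM on most consumer cards. If memory is tight, a common
# diffusers pattern is to drop the .to(device) calls and enable model CPU
# offload instead (requires the `accelerate` package), e.g.:
#
#     txt2img_pipe.enable_model_cpu_offload()
#     pix2pix_pipe.enable_model_cpu_offload()
#     wan_pipe.enable_model_cpu_offload()
#     svd_pipe.enable_model_cpu_offload()
#
# This keeps each pipeline's sub-models (text encoder, UNet/transformer, VAE)
# on the CPU and moves each one to the GPU only while it is running.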