kevalfst committed on
Commit cce6ff1 · verified · 1 Parent(s): 01aaa01

Update app.py

Files changed (1)
  1. app.py +89 -50
app.py CHANGED
@@ -1,54 +1,93 @@
-import gradio as gr
-from transformers import Qwen2_5OmniForCausalLM, AutoProcessor
 import torch
+import gradio as gr
+from diffusers import (
+    StableDiffusionPipeline,
+    StableDiffusionInstructPix2PixPipeline,
+    StableVideoDiffusionPipeline,
+    WanPipeline,
+)
+from diffusers.utils import export_to_video, load_image
+
+# Set dtype and device
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# -------- Text to Image: Stable Diffusion --------
+txt2img_pipe = StableDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-base", torch_dtype=dtype
+)
+txt2img_pipe.to(device)
+
+def generate_image_from_text(prompt):
+    image = txt2img_pipe(prompt, num_inference_steps=30).images[0]
+    return image
+
+
+# -------- Image to Image: Instruct Pix2Pix --------
+pix2pix_pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+    "timbrooks/instruct-pix2pix", torch_dtype=dtype
+)
+pix2pix_pipe.to(device)
+
+def generate_image_from_image_and_prompt(image, prompt):
+    result = pix2pix_pipe(prompt=prompt, image=image, num_inference_steps=10)
+    return result.images[0]
+
+
+# -------- Text to Video: Wan T2V --------
+wan_pipe = WanPipeline.from_pretrained(
+    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
+)
+wan_pipe.to(device)
 
-# Load model and processor
-model_name = "Qwen/Qwen2.5-Omni-3B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
-processor = AutoProcessor.from_pretrained(model_name)
-device = model.device
-
-# Function to process inputs and generate response
-def process_input(text_input, image_input=None, audio_input=None, video_input=None):
-    conversation = [
-        {"role": "user", "content": [{"text": text_input}]}
-    ]
-    if image_input:
-        conversation[0]["content"].append({"image": image_input})
-    if audio_input:
-        conversation[0]["content"].append({"audio": audio_input})
-    if video_input:
-        conversation[0]["content"].append({"video": video_input})
-
-    # Process conversation
-    model_inputs = processor.apply_chat_template(conversation, return_tensors="pt").to(device)
-
-    # Generate response
-    outputs = model.generate(**model_inputs, max_length=200)
-    response_text = processor.decode(outputs[0], skip_special_tokens=True)
-
-    # Audio output not implemented
-    response_audio = None
-
-    return response_text, response_audio
-
-# Gradio interface
+def generate_video_from_text(prompt):
+    frames = wan_pipe(prompt=prompt, num_frames=16).frames[0]
+    video_path = export_to_video(frames, "wan_video.mp4", fps=8)
+    return video_path
+
+
+# -------- Image to Video: Stable Video Diffusion --------
+svd_pipe = StableVideoDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-video-diffusion-img2vid-xt",
+    torch_dtype=dtype,
+    variant="fp16" if dtype == torch.float16 else None,
+)
+svd_pipe.to(device)
+
+def generate_video_from_image(image):
+    image = image.resize((1024, 576))
+    frames = svd_pipe(image, num_inference_steps=25).frames[0]
+    video_path = export_to_video(frames, "svd_video.mp4", fps=8)
+    return video_path
+
+
+# -------- Gradio Interface --------
 with gr.Blocks() as demo:
-    gr.Markdown("# Qwen2.5-Omni-3B Demo")
-    with gr.Row():
-        text_input = gr.Textbox(label="Text Input")
-        image_input = gr.Image(label="Upload Image", type="filepath")
-        audio_input = gr.Audio(label="Upload Audio", type="filepath")
-        video_input = gr.Video(label="Upload Video", type="filepath")
-    submit_button = gr.Button("Submit")
-    text_output = gr.Textbox(label="Text Response")
-    audio_output = gr.Audio(label="Audio Response")
-
-    submit_button.click(
-        fn=process_input,
-        inputs=[text_input, image_input, audio_input, video_input],
-        outputs=[text_output, audio_output]
-    )
-
-# Launch the app
+    gr.Markdown("# 🧠 Multimodal Any-to-Any AI Playground")
+
+    with gr.Tab("Text → Image"):
+        prompt = gr.Textbox(label="Prompt")
+        output_image = gr.Image()
+        btn1 = gr.Button("Generate")
+        btn1.click(fn=generate_image_from_text, inputs=prompt, outputs=output_image)
+
+    with gr.Tab("Image → Image"):
+        in_image = gr.Image(label="Input Image")
+        edit_prompt = gr.Textbox(label="Edit Prompt")
+        out_image = gr.Image()
+        btn2 = gr.Button("Generate")
+        btn2.click(fn=generate_image_from_image_and_prompt, inputs=[in_image, edit_prompt], outputs=out_image)
+
+    with gr.Tab("Text → Video"):
+        vid_prompt = gr.Textbox(label="Prompt")
+        output_vid = gr.Video()
+        btn3 = gr.Button("Generate")
+        btn3.click(fn=generate_video_from_text, inputs=vid_prompt, outputs=output_vid)
+
+    with gr.Tab("Image → Video"):
+        img_input = gr.Image(label="Input Image")
+        vid_out = gr.Video()
+        btn4 = gr.Button("Animate")
+        btn4.click(fn=generate_video_from_image, inputs=img_input, outputs=vid_out)
+
 demo.launch()
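
Note on the two image tabs in the new UI: gr.Image defaults to type="numpy", while generate_video_from_image calls the PIL-only method image.resize((1024, 576)). Below is a minimal sketch of how those two inputs could be declared so the callbacks receive PIL images; the type="pil" choice is an assumption about the intended behavior, not something this commit changes.

import gradio as gr

# Hypothetical drop-in replacements for the two image inputs above.
# type="pil" makes Gradio pass a PIL.Image.Image to the callback, so the
# .resize((1024, 576)) call in generate_video_from_image works as written
# and the diffusers pipelines receive images in a format they accept directly.
in_image = gr.Image(label="Input Image", type="pil")   # "Image → Image" tab
img_input = gr.Image(label="Input Image", type="pil")  # "Image → Video" tab

If GPU memory is tight, another standard diffusers option (it requires accelerate) would be to call pipe.enable_model_cpu_offload() on each pipeline instead of pipe.to(device); again, that is a suggestion, not part of this diff.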