kevalfst committed on
Commit cce6ff1 · verified · 1 Parent(s): 01aaa01

Update app.py

Files changed (1)
  1. app.py +89 -50
app.py CHANGED
@@ -1,54 +1,93 @@
-import gradio as gr
-from transformers import Qwen2_5OmniForCausalLM, AutoProcessor
 import torch
+import gradio as gr
+from diffusers import (
+    StableDiffusionPipeline,
+    StableDiffusionInstructPix2PixPipeline,
+    StableVideoDiffusionPipeline,
+    WanPipeline,
+)
+from diffusers.utils import export_to_video, load_image
+
+# Set dtype and device
+dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# -------- Text to Image: Stable Diffusion --------
+txt2img_pipe = StableDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-base", torch_dtype=dtype
+)
+txt2img_pipe.to(device)
+
+def generate_image_from_text(prompt):
+    image = txt2img_pipe(prompt, num_inference_steps=30).images[0]
+    return image
+
+
+# -------- Image to Image: Instruct Pix2Pix --------
+pix2pix_pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
+    "timbrooks/instruct-pix2pix", torch_dtype=dtype
+)
+pix2pix_pipe.to(device)
+
+def generate_image_from_image_and_prompt(image, prompt):
+    result = pix2pix_pipe(prompt=prompt, image=image, num_inference_steps=10)
+    return result.images[0]
+
+
+# -------- Text to Video: Wan T2V --------
+wan_pipe = WanPipeline.from_pretrained(
+    "Wan-AI/Wan2.1-T2V-1.3B-Diffusers", torch_dtype=torch.bfloat16
+)
+wan_pipe.to(device)
 
-# Load model and processor
-model_name = "Qwen/Qwen2.5-Omni-3B"
-model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto")
-processor = AutoProcessor.from_pretrained(model_name)
-device = model.device
-
-# Function to process inputs and generate response
-def process_input(text_input, image_input=None, audio_input=None, video_input=None):
-    conversation = [
-        {"role": "user", "content": [{"text": text_input}]}
-    ]
-    if image_input:
-        conversation[0]["content"].append({"image": image_input})
-    if audio_input:
-        conversation[0]["content"].append({"audio": audio_input})
-    if video_input:
-        conversation[0]["content"].append({"video": video_input})
-
-    # Process conversation
-    model_inputs = processor.apply_chat_template(conversation, return_tensors="pt").to(device)
-
-    # Generate response
-    outputs = model.generate(**model_inputs, max_length=200)
-    response_text = processor.decode(outputs[0], skip_special_tokens=True)
-
-    # Audio output not implemented
-    response_audio = None
-
-    return response_text, response_audio
-
-# Gradio interface
+def generate_video_from_text(prompt):
+    frames = wan_pipe(prompt=prompt, num_frames=16).frames[0]
+    video_path = export_to_video(frames, "wan_video.mp4", fps=8)
+    return video_path
+
+
+# -------- Image to Video: Stable Video Diffusion --------
+svd_pipe = StableVideoDiffusionPipeline.from_pretrained(
+    "stabilityai/stable-video-diffusion-img2vid-xt",
+    torch_dtype=dtype,
+    variant="fp16" if dtype == torch.float16 else None,
+)
+svd_pipe.to(device)
+
+def generate_video_from_image(image):
+    image = image.resize((1024, 576))
+    frames = svd_pipe(image, num_inference_steps=25).frames[0]
+    video_path = export_to_video(frames, "svd_video.mp4", fps=8)
+    return video_path
+
+
+# -------- Gradio Interface --------
 with gr.Blocks() as demo:
-    gr.Markdown("# Qwen2.5-Omni-3B Demo")
-    with gr.Row():
-        text_input = gr.Textbox(label="Text Input")
-        image_input = gr.Image(label="Upload Image", type="filepath")
-        audio_input = gr.Audio(label="Upload Audio", type="filepath")
-        video_input = gr.Video(label="Upload Video", type="filepath")
-    submit_button = gr.Button("Submit")
-    text_output = gr.Textbox(label="Text Response")
-    audio_output = gr.Audio(label="Audio Response")
-
-    submit_button.click(
-        fn=process_input,
-        inputs=[text_input, image_input, audio_input, video_input],
-        outputs=[text_output, audio_output]
-    )
-
-# Launch the app
+    gr.Markdown("# 🧠 Multimodal Any-to-Any AI Playground")
+
+    with gr.Tab("Text → Image"):
+        prompt = gr.Textbox(label="Prompt")
+        output_image = gr.Image()
+        btn1 = gr.Button("Generate")
+        btn1.click(fn=generate_image_from_text, inputs=prompt, outputs=output_image)
+
+    with gr.Tab("Image → Image"):
+        in_image = gr.Image(label="Input Image")
+        edit_prompt = gr.Textbox(label="Edit Prompt")
+        out_image = gr.Image()
+        btn2 = gr.Button("Generate")
+        btn2.click(fn=generate_image_from_image_and_prompt, inputs=[in_image, edit_prompt], outputs=out_image)
+
+    with gr.Tab("Text → Video"):
+        vid_prompt = gr.Textbox(label="Prompt")
+        output_vid = gr.Video()
+        btn3 = gr.Button("Generate")
+        btn3.click(fn=generate_video_from_text, inputs=vid_prompt, outputs=output_vid)
+
+    with gr.Tab("Image → Video"):
+        img_input = gr.Image(label="Input Image")
+        vid_out = gr.Video()
+        btn4 = gr.Button("Animate")
+        btn4.click(fn=generate_video_from_image, inputs=img_input, outputs=vid_out)
+
 demo.launch()
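
Note on the two image tabs in the new UI: gr.Image defaults to type="numpy", while generate_video_from_image calls the PIL-only method image.resize((1024, 576)). Below is a minimal sketch of how those two inputs could be declared so the callbacks receive PIL images; the type="pil" choice is an assumption about the intended behavior, not something this commit changes.

import gradio as gr

# Hypothetical drop-in replacements for the two image inputs above.
# type="pil" makes Gradio pass a PIL.Image.Image to the callback, so the
# .resize((1024, 576)) call in generate_video_from_image works as written
# and the diffusers pipelines receive images in a format they accept directly.
in_image = gr.Image(label="Input Image", type="pil")   # "Image → Image" tab
img_input = gr.Image(label="Input Image", type="pil")  # "Image → Video" tab

If GPU memory is tight, another standard diffusers option (it requires accelerate) would be to call pipe.enable_model_cpu_offload() on each pipeline instead of pipe.to(device); again, that is a suggestion, not part of this diff.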