Spaces:
Running
Running
import gradio as gr | |
import torch | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
import os | |
# Checkpoint identifier; the same value is handed to both from_pretrained
# calls below so the tokenizer and the weights always come from one source.
model_name = "Qwen/Qwen2.5-Omni-3B"

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Weights are requested in bfloat16 and device placement is delegated to
# device_map="auto".
# NOTE(review): confirm that this checkpoint can be loaded through
# AutoModelForCausalLM with the installed transformers version.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
def process_input(text_input, image_input=None, audio_input=None):
    """Generate a text reply for the prompt submitted through the UI.

    Only the text prompt is fed to the model. The image and audio inputs are
    collected alongside it but are not consumed by generation yet.

    Args:
        text_input: Prompt string typed by the user.
        image_input: Optional uploaded image; currently unused by generation.
        audio_input: Optional uploaded audio; currently unused by generation.

    Returns:
        A ``(response_text, response_audio)`` tuple. ``response_audio`` is
        always ``None`` because speech generation is not implemented. An
        empty or missing prompt yields ``("", None)`` without calling the
        model.
    """
    # Nothing to generate from: skip tokenization/generation entirely rather
    # than handing the model a missing or zero-length prompt.
    if not text_input:
        return "", None

    inputs = {"text": text_input}
    # Compare against None instead of relying on truthiness: an uploaded
    # image may arrive as a NumPy array, whose truth value is ambiguous and
    # raises ValueError inside a bare ``if``.
    if image_input is not None:
        inputs["image"] = image_input
    if audio_input is not None:
        inputs["audio"] = audio_input

    # Tokenize the text prompt only (simplified for the demo).
    input_ids = tokenizer.encode(inputs["text"], return_tensors="pt").to(model.device)

    # Budget the *generated* tokens. A total-length cap would include the
    # prompt, leaving long prompts with no room for a reply.
    outputs = model.generate(input_ids, max_new_tokens=200)
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Placeholder: speech generation requires additional setup.
    response_audio = None
    return response_text, response_audio
# Gradio UI: a title, a row holding the three input widgets, a submit button,
# and one text plus one audio output widget.
with gr.Blocks() as demo:
    gr.Markdown("# Qwen2.5-Omni-3B Demo")

    with gr.Row():
        text_input = gr.Textbox(label="Text Input")
        image_input = gr.Image(label="Upload Image")
        audio_input = gr.Audio(label="Upload Audio")

    submit_button = gr.Button("Submit")
    text_output = gr.Textbox(label="Text Response")
    audio_output = gr.Audio(label="Audio Response")

    # Clicking the button passes the three input values to process_input and
    # routes its two return values to the output widgets, in order.
    request_components = [text_input, image_input, audio_input]
    response_components = [text_output, audio_output]
    submit_button.click(
        fn=process_input,
        inputs=request_components,
        outputs=response_components,
    )

# Start serving the app.
demo.launch()