# Maya-AI / app.py
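"""Supernatural Speech AI: a Gradio Space that records microphone audio, runs
voice activity detection (pyannote), speech emotion recognition (wav2vec2),
a Descript Audio Codec encode/decode round trip, and Ultravox audio-language
generation, then replies with Dia text-to-speech."""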
import gradio as gr
from transformers import AutoProcessor, AutoModelForCausalLM, pipeline
import torch
import numpy as np
from pyannote.audio import Pipeline as VAD
import dac
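# NOTE (assumed package names, not pinned requirements): this script expects
# roughly `pip install gradio transformers torch numpy diffusers pyannote.audio
# descript-audio-codec` plus the Dia TTS package from nari-labs
# (https://github.com/nari-labs/dia).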
# Load models with proper error handling
def load_models():
    try:
        # Ultravox via transformers (no separate package needed)
        ultra_proc = AutoProcessor.from_pretrained("fixie-ai/ultravox-v0_4", trust_remote_code=True)
        ultra_model = AutoModelForCausalLM.from_pretrained(
            "fixie-ai/ultravox-v0_4", device_map="auto", torch_dtype=torch.float16, trust_remote_code=True
        )
        # Speech emotion recognition via transformers pipeline
        emotion_pipeline = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1,
        )
        # Audio diffusion (using diffusers instead of torch.hub for HF compatibility)
        from diffusers import DiffusionPipeline
        diff_pipe = DiffusionPipeline.from_pretrained("teticio/audio-diffusion-instrumental-hiphop-256")
        # Descript Audio Codec
        from dac.utils import load_model as load_dac_model
        rvq = load_dac_model(tag="latest", model_type="44khz")
        rvq.eval()
        if torch.cuda.is_available():
            rvq = rvq.to("cuda")
        # VAD
        vad = VAD.from_pretrained("pyannote/voice-activity-detection")
        # Dia TTS
        from dia.model import Dia
        dia = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16")
        return ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia
    except Exception as e:
        print(f"Error loading models: {e}")
        return None, None, None, None, None, None, None
# Initialize models
ultra_proc, ultra_model, emotion_pipeline, diff_pipe, rvq, vad, dia = load_models()
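# process_audio is the single Gradio callback: it takes the (sample_rate, samples)
# tuple from gr.Audio, then runs voice activity detection, emotion recognition,
# a DAC encode/decode round trip, Ultravox generation, and Dia TTS. Each stage
# checks whether its model loaded, so the demo degrades gracefully if some
# models failed to initialize.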
def process_audio(audio):
    try:
        if audio is None:
            return None, "No audio input provided"
        # gr.Audio(type="numpy") supplies a (sample_rate, samples) tuple
        audio_array = audio[1] if isinstance(audio, tuple) else audio["array"]
        sample_rate = audio[0] if isinstance(audio, tuple) else audio["sampling_rate"]
        # Ensure audio is a mono float32 numpy array in [-1, 1]
        # (Gradio typically records int16 and may return stereo)
        if torch.is_tensor(audio_array):
            audio_array = audio_array.numpy()
        if np.issubdtype(audio_array.dtype, np.integer):
            audio_array = audio_array.astype(np.float32) / 32768.0
        if audio_array.ndim > 1:
            audio_array = audio_array.mean(axis=1)
        audio_array = audio_array.astype(np.float32)
        # VAD processing (segments are computed but not used further in this demo)
        if vad is not None:
            speech_segments = vad({"waveform": torch.from_numpy(audio_array).unsqueeze(0), "sample_rate": sample_rate})
        # Emotion recognition
        emotion_result = "neutral"
        if emotion_pipeline is not None:
            try:
                # The audio-classification pipeline accepts a dict carrying the
                # raw waveform and its sampling rate
                emotion_pred = emotion_pipeline({"raw": audio_array, "sampling_rate": sample_rate})
                emotion_result = emotion_pred[0]["label"] if emotion_pred else "neutral"
            except Exception:
                emotion_result = "neutral"
        # RVQ encode/decode round trip through the Descript Audio Codec
        if rvq is not None:
            try:
                # DAC expects a (batch, channels, samples) float tensor
                audio_tensor = torch.from_numpy(audio_array).float().unsqueeze(0).unsqueeze(0)
                if torch.cuda.is_available():
                    audio_tensor = audio_tensor.to("cuda")
                with torch.no_grad():
                    # encode() returns the continuous latents first, followed by
                    # the discrete codes; decode() maps the latents back to audio
                    z = rvq.encode(audio_tensor)[0]
                    decoded_audio = rvq.decode(z)
                if torch.cuda.is_available():
                    decoded_audio = decoded_audio.cpu()
                audio_array = decoded_audio.squeeze().numpy()
            except Exception as e:
                print(f"RVQ processing error: {e}")
        # Ultravox generation
        response_text = "I understand your audio input."
        if ultra_proc is not None and ultra_model is not None:
            try:
                inputs = ultra_proc(audio_array, sampling_rate=sample_rate, return_tensors="pt")
                if torch.cuda.is_available():
                    inputs = {k: v.to("cuda") for k, v in inputs.items()}
                with torch.no_grad():
                    outputs = ultra_model.generate(**inputs, max_new_tokens=50)
                response_text = ultra_proc.decode(outputs[0], skip_special_tokens=True)
            except Exception as e:
                print(f"Ultravox generation error: {e}")
                response_text = f"Detected emotion: {emotion_result}"
        # TTS generation
        output_audio = None
        if dia is not None:
            try:
                tts_text = f"[emotion:{emotion_result}] {response_text}"
                output_audio = dia.generate(tts_text)
                if torch.is_tensor(output_audio):
                    output_audio = output_audio.cpu().numpy()
                # Normalize audio, guarding against an all-zero buffer
                if output_audio is not None:
                    peak = np.max(np.abs(output_audio))
                    if peak > 0:
                        output_audio = output_audio / peak * 0.95
            except Exception as e:
                print(f"TTS generation error: {e}")
        # Note: the synthesized reply is returned at the input sample rate;
        # Dia's native output rate may differ.
        return ((sample_rate, output_audio) if output_audio is not None else None), response_text
    except Exception as e:
        return None, f"Processing error: {str(e)}"
# Create Gradio interface
with gr.Blocks(title="Supernatural Speech AI") as demo:
    gr.Markdown("# Supernatural Speech AI Agent")
    gr.Markdown("Record audio to interact with the AI agent that understands emotions and responds naturally.")
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(source="microphone", type="numpy", label="Record Audio")
            process_btn = gr.Button("Process Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="AI Response")
            text_output = gr.Textbox(label="Response Text", lines=3)
    conversation_history = gr.State([])
    process_btn.click(
        fn=process_audio,
        inputs=[audio_input],
        outputs=[audio_output, text_output]
    )

if __name__ == "__main__":
    demo.queue(concurrency_limit=20, max_size=50).launch()