Spaces:

reecursion
/

task-oriented-dialog-agent

Build error

App Files Files Community

task-oriented-dialog-agent / speech_conversation_app.py

reecursion

Create speech_conversation_app.py

df90a53 verified 4 months ago

raw

history blame

10.4 kB

	import os
	import time
	import numpy as np
	import torch
	import gradio as gr
	from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer, AutoProcessor, AutoModelForSpeechSeq2Seq
	from datasets import load_dataset
	import soundfile as sf

	# Global variables to track latency
	latency_ASR = 0.0
	latency_LLM = 0.0
	latency_TTS = 0.0

	# Global variables to store conversation state
	conversation_history = []
	audio_output = None

	# ASR Models
	ASR_OPTIONS = {
	"Whisper Small": "openai/whisper-small",
	"Wav2Vec2": "facebook/wav2vec2-base-960h"
	}

	# LLM Models
	LLM_OPTIONS = {
	"Llama-2 7B Chat": "meta-llama/Llama-2-7b-chat-hf",
	"Flan-T5 Small": "google/flan-t5-small"
	}

	# TTS Models
	TTS_OPTIONS = {
	"VITS": "espnet/kan-bayashi_ljspeech_vits",
	"FastSpeech2": "espnet/kan-bayashi_ljspeech_fastspeech2"
	}

	# Load models
	asr_models = {}
	llm_models = {}
	tts_models = {}

	def load_asr_model(model_name):
	"""Load ASR model from Hugging Face"""
	global asr_models

	if model_name not in asr_models:
	print(f"Loading ASR model: {model_name}")
	model_id = ASR_OPTIONS[model_name]

	if "whisper" in model_id:
	asr_models[model_name] = pipeline("automatic-speech-recognition", model=model_id)
	else: # wav2vec2
	processor = AutoProcessor.from_pretrained(model_id)
	model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
	asr_models[model_name] = {"processor": processor, "model": model}

	return asr_models[model_name]

	def load_llm_model(model_name):
	"""Load LLM model from Hugging Face"""
	global llm_models

	if model_name not in llm_models:
	print(f"Loading LLM model: {model_name}")
	model_id = LLM_OPTIONS[model_name]

	tokenizer = AutoTokenizer.from_pretrained(model_id)
	model = AutoModelForCausalLM.from_pretrained(
	model_id,
	torch_dtype=torch.float16,
	device_map="auto"
	)

	llm_models[model_name] = {
	"model": model,
	"tokenizer": tokenizer
	}

	return llm_models[model_name]

	def load_tts_model(model_name):
	"""Load TTS model using ESPnet"""
	global tts_models

	if model_name not in tts_models:
	print(f"Loading TTS model: {model_name}")
	try:
	# Import ESPnet TTS modules
	from espnet2.bin.tts_inference import Text2Speech

	model_id = TTS_OPTIONS[model_name]
	tts = Text2Speech.from_pretrained(model_id)
	tts_models[model_name] = tts

	except ImportError:
	print("ESPnet not installed. Using mock TTS for demonstration.")
	tts_models[model_name] = "mock_tts"

	return tts_models[model_name]

	def transcribe_audio(audio_data, sr, asr_model_name):
	"""Transcribe audio using selected ASR model"""
	global latency_ASR

	start_time = time.time()

	model = load_asr_model(asr_model_name)

	if "whisper" in ASR_OPTIONS[asr_model_name]:
	result = model({"array": audio_data, "sampling_rate": sr})
	transcript = result["text"]
	else: # wav2vec2
	inputs = model["processor"](audio_data, sampling_rate=sr, return_tensors="pt")
	with torch.no_grad():
	outputs = model["model"].generate(**inputs)
	transcript = model["processor"].batch_decode(outputs, skip_special_tokens=True)[0]

	latency_ASR = time.time() - start_time
	return transcript

	def generate_response(transcript, llm_model_name, system_prompt):
	"""Generate response using selected LLM model"""
	global latency_LLM, conversation_history

	start_time = time.time()

	model_info = load_llm_model(llm_model_name)
	model = model_info["model"]
	tokenizer = model_info["tokenizer"]

	# Format the prompt based on the model
	if "llama" in LLM_OPTIONS[llm_model_name].lower():
	# Format for Llama models
	if not conversation_history:
	conversation_history.append({"role": "system", "content": system_prompt})

	conversation_history.append({"role": "user", "content": transcript})

	prompt = tokenizer.apply_chat_template(
	conversation_history,
	tokenize=False,
	add_generation_prompt=True
	)
	else:
	# Format for T5 models
	prompt = f"{system_prompt}\nUser: {transcript}\nAssistant:"

	# Generate text
	input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

	with torch.no_grad():
	outputs = model.generate(
	input_ids,
	max_new_tokens=100,
	temperature=0.7,
	top_p=0.9,
	)

	# Decode the response
	if "llama" in LLM_OPTIONS[llm_model_name].lower():
	response = tokenizer.decode(outputs[0], skip_special_tokens=True)
	# Extract just the assistant's response
	response = response.split("Assistant: ")[-1].strip()
	else:
	response = tokenizer.decode(outputs[0], skip_special_tokens=True)

	# Add to conversation history
	conversation_history.append({"role": "assistant", "content": response})

	latency_LLM = time.time() - start_time
	return response

	def synthesize_speech(text, tts_model_name):
	"""Synthesize speech using selected TTS model"""
	global latency_TTS

	start_time = time.time()

	tts = load_tts_model(tts_model_name)

	if tts == "mock_tts":
	# Mock TTS response for demonstration
	# In a real implementation, this would use the ESPnet model
	# Load a sample audio file for demonstration
	try:
	sample_rate = 16000
	# Generate a simple sine wave as demo audio
	duration = 2 # seconds
	t = np.linspace(0, duration, int(sample_rate * duration), endpoint=False)
	audio_data = 0.5 * np.sin(2 * np.pi * 220 * t) # 220 Hz sine wave
	except Exception as e:
	print(f"Error generating mock audio: {e}")
	audio_data = np.zeros(16000) # 1 second of silence
	sample_rate = 16000
	else:
	# Use actual ESPnet TTS model
	with torch.no_grad():
	wav = tts(text)["wav"]
	audio_data = wav.numpy()
	sample_rate = tts.fs

	latency_TTS = time.time() - start_time
	return (sample_rate, audio_data)

	def process_speech(
	audio_input,
	asr_option,
	llm_option,
	tts_option,
	system_prompt
	):
	"""Process speech: ASR -> LLM -> TTS pipeline"""
	global audio_output

	# Check if audio input is available
	if audio_input is None:
	return None, "", "", None

	# Get audio data
	sr, audio_data = audio_input

	# ASR: Speech to text
	transcript = transcribe_audio(audio_data, sr, asr_option)

	# LLM: Generate response
	response = generate_response(transcript, llm_option, system_prompt)

	# TTS: Text to speech
	audio_output = synthesize_speech(response, tts_option)

	# Return results
	return audio_input, transcript, response, audio_output

	def display_latency():
	"""Display latency information"""
	return f"""
	ASR Latency: {latency_ASR:.2f} seconds
	LLM Latency: {latency_LLM:.2f} seconds
	TTS Latency: {latency_TTS:.2f} seconds
	Total Latency: {latency_ASR + latency_LLM + latency_TTS:.2f} seconds
	"""

	def reset_conversation():
	"""Reset the conversation history"""
	global conversation_history, audio_output
	conversation_history = []
	audio_output = None
	return None, "", "", None, ""

	# Create Gradio interface
	with gr.Blocks(title="Conversational Speech System") as demo:
	gr.Markdown(
	"""
	# Conversational Speech System with ASR, LLM, and TTS

	This demo showcases a complete speech-to-speech conversation system using:
	- ASR (Automatic Speech Recognition) to convert your speech to text
	- LLM (Large Language Model) to generate responses
	- TTS (Text-to-Speech) to convert the responses to speech

	Speak into your microphone and the system will respond with synthesized speech.
	"""
	)

	with gr.Row():
	with gr.Column(scale=1):
	# Input components
	audio_input = gr.Audio(
	sources=["microphone"],
	type="numpy",
	label="Speak here",
	)

	system_prompt = gr.Textbox(
	label="System Prompt (instructions for the LLM)",
	value="You are a helpful and friendly AI assistant. Keep your responses concise and under 3 sentences."
	)

	asr_dropdown = gr.Dropdown(
	choices=list(ASR_OPTIONS.keys()),
	value=list(ASR_OPTIONS.keys())[0],
	label="Select ASR Model"
	)

	llm_dropdown = gr.Dropdown(
	choices=list(LLM_OPTIONS.keys()),
	value=list(LLM_OPTIONS.keys())[0],
	label="Select LLM Model"
	)

	tts_dropdown = gr.Dropdown(
	choices=list(TTS_OPTIONS.keys()),
	value=list(TTS_OPTIONS.keys())[0],
	label="Select TTS Model"
	)

	reset_btn = gr.Button("Reset Conversation")

	with gr.Column(scale=1):
	# Output components
	user_transcript = gr.Textbox(label="Your Speech (ASR Output)")
	system_response = gr.Textbox(label="AI Response (LLM Output)")
	audio_output_component = gr.Audio(label="AI Voice Response", autoplay=True)
	latency_info = gr.Textbox(label="Performance Metrics")

	# Set up event handlers
	audio_input.change(
	process_speech,
	inputs=[audio_input, asr_dropdown, llm_dropdown, tts_dropdown, system_prompt],
	outputs=[audio_input, user_transcript, system_response, audio_output_component]
	).then(
	display_latency,
	inputs=[],
	outputs=[latency_info]
	)

	reset_btn.click(
	reset_conversation,
	inputs=[],
	outputs=[audio_input, user_transcript, system_response, audio_output_component, latency_info]
	)

	# Launch the app
	if __name__ == "__main__":
	demo.launch()