Spaces:

DroolingPanda
/

teachingAssistant

Build error

Michael Hu

move to Gradio so we can leverage ZeroGPU

f0248ed 5 months ago

7.38 kB

	"""Main entry point for the Audio Translation Web Application using Gradio
	Handles file upload, processing pipeline, and UI rendering
	"""

	import logging

	# Root logging setup: records at INFO and above go to both app.log and the
	# console stream. This runs ahead of the remaining imports (presumably so
	# loggers created while those modules load pick up the same configuration).
	logging.basicConfig(
	    handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)

	# Module-level logger named after this module
	logger = logging.getLogger(__name__)

	import gradio as gr
	import os
	import time
	import numpy as np
	import soundfile as sf
	from utils.stt import transcribe_audio
	from utils.translation import translate_text
	from utils.tts import get_tts_engine

	# Ensure the scratch directories exist before any request is handled:
	# temp/uploads receives saved input audio, temp/outputs receives TTS chunks.
	for _scratch_dir in ("temp/uploads", "temp/outputs"):
	    os.makedirs(_scratch_dir, exist_ok=True)

	# Custom CSS passed to gr.Blocks(css=...) in create_interface.
	# - .gradio-container: caps the page width at 1200px and centers it.
	# - .output-text: monospace, padded, light-grey style that the two result
	#   textboxes opt into through elem_classes=["output-text"].
	css = """
	.gradio-container {
	max-width: 1200px;
	margin: 0 auto;
	}

	.output-text {
	font-family: monospace;
	padding: 10px;
	background-color: #f5f5f5;
	border-radius: 4px;
	}
	"""

	def handle_file_processing(audio_file):
	    """
	    Execute the complete processing pipeline:
	    1. Speech-to-Text (STT)
	    2. Machine Translation
	    3. Text-to-Speech (TTS)

	    Args:
	        audio_file: Tuple containing (sample_rate, audio_data), or None when
	            no audio has been uploaded or recorded.

	    Returns:
	        Tuple containing (english_text, chinese_text, output_audio), where
	        output_audio is itself a (sample_rate, audio_data) tuple read back
	        from the file produced by the TTS engine.

	    Raises:
	        gr.Error: If no audio was provided or any pipeline stage fails.
	    """
	    logger.info("Starting processing for uploaded audio")

	    # Without this guard a missing upload reaches the tuple unpacking below
	    # and surfaces as an opaque "cannot unpack NoneType" message.
	    if audio_file is None:
	        logger.error("Processing failed: no audio provided")
	        raise gr.Error(
	            "Processing Failed: no audio provided. "
	            "Please upload or record audio first."
	        )

	    temp_path = None
	    try:
	        # Save the uploaded audio to a temporary file
	        sr, audio_data = audio_file
	        temp_path = os.path.join("temp/uploads", f"upload_{time.time()}.wav")
	        sf.write(temp_path, audio_data, sr)
	        logger.info(f"Saved uploaded audio to {temp_path}")

	        # STT Phase
	        logger.info("Beginning STT processing")
	        english_text = transcribe_audio(temp_path)
	        logger.info(f"STT completed. Text length: {len(english_text)} characters")

	        # Translation Phase
	        logger.info("Beginning translation")
	        chinese_text = translate_text(english_text)
	        logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")

	        # TTS Phase
	        logger.info("Beginning TTS generation")

	        # Initialize TTS engine with appropriate language code for Chinese
	        engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese

	        # Generate speech and get the file path
	        output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
	        logger.info(f"TTS completed. Output file: {output_path}")

	        # Load the generated audio for Gradio output; separate names keep the
	        # synthesized result distinct from the uploaded input above.
	        output_data, output_sr = sf.read(output_path)

	        return english_text, chinese_text, (output_sr, output_data)

	    except Exception as e:
	        logger.error(f"Processing failed: {str(e)}", exc_info=True)
	        # Chain the original exception so the root cause stays in tracebacks
	        raise gr.Error(f"Processing Failed: {str(e)}") from e

	    finally:
	        # The saved upload is only needed by the STT step; remove it so
	        # temp/uploads does not grow with every request. Cleanup is
	        # best-effort and must never mask the real outcome.
	        if temp_path is not None and os.path.exists(temp_path):
	            try:
	                os.remove(temp_path)
	            except OSError as cleanup_error:
	                logger.warning(f"Could not remove temporary upload {temp_path}: {cleanup_error}")

	def stream_audio(chinese_text, voice, speed):
	    """
	    Stream audio in chunks for the Gradio interface

	    Args:
	        chinese_text: The Chinese text to convert to speech
	        voice: The voice to use
	        speed: The speech speed factor

	    Yields:
	        (sample_rate, chunk_data) tuples, one per chunk produced by the TTS
	        engine's generate_speech_stream.
	    """
	    engine = get_tts_engine(lang_code='z')

	    # Stream the audio in chunks
	    for sample_rate, audio_chunk in engine.generate_speech_stream(
	        chinese_text,
	        voice=voice,
	        speed=speed
	    ):
	        # Each chunk is written to a WAV file and read straight back, so the
	        # yielded data is whatever sf.read returns for that file.
	        temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
	        try:
	            sf.write(temp_chunk_path, audio_chunk, sample_rate)

	            # Load the chunk for Gradio output
	            chunk_data, sr = sf.read(temp_chunk_path)
	        finally:
	            # Always clean up the temporary chunk file, even when the write
	            # or read fails; the existence check keeps a failed write from
	            # being masked by FileNotFoundError.
	            if os.path.exists(temp_chunk_path):
	                os.remove(temp_chunk_path)

	        yield (sr, chunk_data)

	def create_interface():
	    """
	    Create and configure the Gradio interface

	    Builds two rows (audio input plus TTS settings, then text results plus
	    audio output), wires the "Process Audio" button to
	    handle_file_processing and the "Stream Audio" button to stream_audio,
	    and registers the example audio files.

	    Returns:
	        Gradio Blocks interface
	    """
	    # Dropdown label -> TTS voice ID. The dropdown choices are derived from
	    # this mapping so the two can never drift apart.
	    voice_map = {
	        "Xiaobei (Female)": "zf_xiaobei",
	        "Yunjian (Male)": "zm_yunjian"
	    }

	    # Map voice selection to actual voice IDs
	    def get_voice_id(voice_name):
	        """Return the voice ID for a dropdown label, defaulting to zf_xiaobei."""
	        return voice_map.get(voice_name, "zf_xiaobei")

	    def stream_selected_voice(text, voice, speed):
	        """Yield audio chunks from stream_audio for the selected voice label."""
	        # This must be a real generator function: Gradio inspects the handler
	        # itself to decide whether to iterate it, so a lambda that merely
	        # returns a generator object would not be streamed.
	        yield from stream_audio(text, get_voice_id(voice), speed)

	    with gr.Blocks(css=css) as interface:
	        gr.Markdown("# 🎧 High-Quality Audio Translation System")
	        gr.Markdown("Upload English Audio → Get Chinese Speech Output")

	        with gr.Row():
	            with gr.Column(scale=2):
	                # File upload component
	                audio_input = gr.Audio(
	                    label="Upload English Audio",
	                    type="numpy",
	                    sources=["upload", "microphone"]
	                )

	                # Process button
	                process_btn = gr.Button("Process Audio", variant="primary")

	            with gr.Column(scale=1):
	                # TTS Settings
	                # gr.Group replaces gr.Box, which no longer exists in the
	                # Gradio 4.x API this file otherwise targets (sources=[...]).
	                with gr.Group():
	                    gr.Markdown("### TTS Settings")
	                    voice_dropdown = gr.Dropdown(
	                        choices=list(voice_map),
	                        value="Xiaobei (Female)",
	                        label="Select Voice"
	                    )
	                    speed_slider = gr.Slider(
	                        minimum=0.5,
	                        maximum=2.0,
	                        value=1.0,
	                        step=0.1,
	                        label="Speech Speed"
	                    )

	        # Output section
	        with gr.Row():
	            with gr.Column(scale=2):
	                # Text outputs
	                english_output = gr.Textbox(
	                    label="Recognition Results",
	                    lines=5,
	                    elem_classes=["output-text"]
	                )

	                chinese_output = gr.Textbox(
	                    label="Translation Results",
	                    lines=5,
	                    elem_classes=["output-text"]
	                )

	            with gr.Column(scale=1):
	                # Audio output
	                audio_output = gr.Audio(
	                    label="Audio Output",
	                    type="numpy"
	                )

	                # Stream button
	                stream_btn = gr.Button("Stream Audio")

	                # Download button is automatically provided by gr.Audio

	        # Set up event handlers
	        process_btn.click(
	            fn=handle_file_processing,
	            inputs=[audio_input],
	            outputs=[english_output, chinese_output, audio_output]
	        )

	        # Stream button handler
	        stream_btn.click(
	            fn=stream_selected_voice,
	            inputs=[chinese_output, voice_dropdown, speed_slider],
	            outputs=audio_output
	        )

	        # Examples
	        gr.Examples(
	            examples=[
	                ["examples/sample1.mp3"],
	                ["examples/sample2.wav"]
	            ],
	            inputs=audio_input
	        )

	    return interface

	def main():
	    """
	    Main application entry point

	    Logs the startup, builds the Gradio interface and launches it.
	    """
	    logger.info("Starting Gradio application")
	    create_interface().launch()


	# Run the app only when this file is executed as a script
	if __name__ == "__main__":
	    main()