Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

File size: 7,383 Bytes

"""Main entry point for the Audio Translation Web Application using Gradio
Handles file upload, the processing pipeline, and UI rendering.
"""

import logging
# Logging is configured here, ahead of the remaining imports below: INFO and
# above is sent to both the "app.log" file and the console stream.
logging.basicConfig(
    handlers=[logging.FileHandler("app.log"), logging.StreamHandler()],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

import gradio as gr
import os
import time
import numpy as np
import soundfile as sf
from utils.stt import transcribe_audio
from utils.translation import translate_text
from utils.tts import get_tts_engine

# Make sure the working directories for uploaded and generated audio exist
# before any request is handled (a no-op when they are already present).
for _temp_dir in ("temp/uploads", "temp/outputs"):
    os.makedirs(_temp_dir, exist_ok=True)

# Custom stylesheet handed to gr.Blocks(css=...):
#   - .gradio-container: caps the app width at 1200px and centers it.
#   - .output-text: monospace, padded, light-grey box; applied to the
#     components created with elem_classes=["output-text"].
css = """
.gradio-container {
    max-width: 1200px;
    margin: 0 auto;
}

.output-text {
    font-family: monospace;
    padding: 10px;
    background-color: #f5f5f5;
    border-radius: 4px;
}
"""

def handle_file_processing(audio_file):
    """
    Execute the complete processing pipeline:
    1. Speech-to-Text (STT)
    2. Machine Translation
    3. Text-to-Speech (TTS)

    Args:
        audio_file: Tuple containing (sample_rate, audio_data), or None when
            no audio has been uploaded or recorded yet.

    Returns:
        Tuple containing (english_text, chinese_text, output_audio), where
        output_audio is the (sample_rate, audio_data) pair read back from the
        file produced by the TTS engine.

    Raises:
        gr.Error: If no audio was provided, or if any stage of the pipeline
            fails (the original exception is chained as the cause).
    """
    # The Audio component delivers None when the button is clicked without any
    # audio; report that directly instead of letting the tuple unpacking below
    # fail with an opaque TypeError.
    if audio_file is None:
        raise gr.Error("No audio provided. Please upload or record audio before processing.")

    logger.info("Starting processing for uploaded audio")

    temp_path = None
    try:
        # Save the uploaded audio to a temporary file
        sr, audio_data = audio_file
        temp_path = os.path.join("temp/uploads", f"upload_{time.time()}.wav")
        sf.write(temp_path, audio_data, sr)
        logger.info("Saved uploaded audio to %s", temp_path)

        # STT Phase
        logger.info("Beginning STT processing")
        english_text = transcribe_audio(temp_path)
        logger.info("STT completed. Text length: %d characters", len(english_text))

        # Translation Phase
        logger.info("Beginning translation")
        chinese_text = translate_text(english_text)
        logger.info(
            "Translation completed. Translated length: %d characters",
            len(chinese_text),
        )

        # TTS Phase
        logger.info("Beginning TTS generation")

        # Initialize TTS engine with appropriate language code for Chinese
        engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese

        # Generate speech and get the file path
        output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
        logger.info("TTS completed. Output file: %s", output_path)

        # Load the generated audio for Gradio output. Distinct names keep the
        # synthesized audio from being confused with the uploaded input.
        output_audio, output_sr = sf.read(output_path)

        return english_text, chinese_text, (output_sr, output_audio)

    except Exception as e:
        # Top-level boundary for the UI: log the traceback and surface a
        # readable message in the Gradio front end.
        logger.exception("Processing failed: %s", e)
        raise gr.Error(f"Processing Failed: {str(e)}") from e

    finally:
        # The upload copy is only needed as input to the STT step; remove it so
        # temp/uploads does not grow with every request.
        if temp_path is not None and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                logger.warning("Could not remove temporary file %s", temp_path)

def stream_audio(chinese_text, voice, speed):
    """
    Stream audio in chunks for the Gradio interface

    Each chunk produced by the TTS engine is encoded as WAV and decoded again
    before being yielded, exactly as before, but the round trip now happens in
    an in-memory buffer. This avoids the per-chunk files under temp/outputs,
    whose time-based names could collide between concurrent streams and which
    were left behind whenever reading the chunk back failed.

    Args:
        chinese_text: The Chinese text to convert to speech
        voice: The voice to use
        speed: The speech speed factor

    Returns:
        Generator yielding (sample_rate, chunk_data) tuples
    """
    import io  # local import: only needed for the in-memory WAV buffer

    engine = get_tts_engine(lang_code='z')

    # Stream the audio in chunks
    for sample_rate, audio_chunk in engine.generate_speech_stream(
        chinese_text,
        voice=voice,
        speed=speed
    ):
        # soundfile cannot infer the container from a file-like object, so the
        # format is given explicitly; "WAV" matches the former .wav files.
        with io.BytesIO() as wav_buffer:
            sf.write(wav_buffer, audio_chunk, sample_rate, format="WAV")
            wav_buffer.seek(0)

            # Load the chunk for Gradio output
            chunk_data, sr = sf.read(wav_buffer)

        yield (sr, chunk_data)

def create_interface():
    """
    Create and configure the Gradio interface

    Layout: an input row (audio upload/recording plus TTS settings), an output
    row (recognized text, translated text, audio player), and example inputs.
    The "Process Audio" button runs handle_file_processing; the "Stream Audio"
    button streams speech for the text currently in the translation box using
    the selected voice and speed.

    Returns:
        Gradio Blocks interface
    """
    with gr.Blocks(css=css) as interface:
        gr.Markdown("# 🎧 High-Quality Audio Translation System")
        gr.Markdown("Upload English Audio → Get Chinese Speech Output")

        with gr.Row():
            with gr.Column(scale=2):
                # File upload component
                audio_input = gr.Audio(
                    label="Upload English Audio",
                    type="numpy",
                    sources=["upload", "microphone"]
                )

                # Process button
                process_btn = gr.Button("Process Audio", variant="primary")

            with gr.Column(scale=1):
                # TTS Settings
                # gr.Group instead of gr.Box: Box no longer exists in the
                # Gradio releases that accept the `sources=` parameter used
                # above, while Group is available across versions.
                with gr.Group():
                    gr.Markdown("### TTS Settings")
                    voice_dropdown = gr.Dropdown(
                        choices=["Xiaobei (Female)", "Yunjian (Male)"],
                        value="Xiaobei (Female)",
                        label="Select Voice"
                    )
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Speed"
                    )

        # Output section
        with gr.Row():
            with gr.Column(scale=2):
                # Text outputs
                english_output = gr.Textbox(
                    label="Recognition Results",
                    lines=5,
                    elem_classes=["output-text"]
                )

                chinese_output = gr.Textbox(
                    label="Translation Results",
                    lines=5,
                    elem_classes=["output-text"]
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Audio Output",
                    type="numpy"
                )

                # Stream button
                stream_btn = gr.Button("Stream Audio")

                # Download button is automatically provided by gr.Audio

        # Set up event handlers
        process_btn.click(
            fn=handle_file_processing,
            inputs=[audio_input],
            outputs=[english_output, chinese_output, audio_output]
        )

        # Map voice selection to actual voice IDs
        def get_voice_id(voice_name):
            voice_map = {
                "Xiaobei (Female)": "zf_xiaobei",
                "Yunjian (Male)": "zm_yunjian"
            }
            # Unknown labels fall back to the default female voice.
            return voice_map.get(voice_name, "zf_xiaobei")

        # Stream button handler. This must be a real generator function:
        # Gradio decides whether to iterate a handler's output by inspecting
        # the function itself, so a lambda that merely returns the generator
        # from stream_audio would hand the generator object to the Audio
        # component instead of streaming its chunks.
        def stream_selected_voice(text, voice, speed):
            yield from stream_audio(text, get_voice_id(voice), speed)

        stream_btn.click(
            fn=stream_selected_voice,
            inputs=[chinese_output, voice_dropdown, speed_slider],
            outputs=audio_output
        )

        # Examples
        gr.Examples(
            examples=[
                ["examples/sample1.mp3"],
                ["examples/sample2.wav"]
            ],
            inputs=audio_input
        )

    return interface

def main():
    """
    Application entry point: log the startup, build the UI, and serve it
    """
    logger.info("Starting Gradio application")
    # Build the Blocks app and start serving it with default launch settings.
    create_interface().launch()

# Only start the server when executed as a script, not when imported.
if __name__ == "__main__":
    main()