Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

App Files Files Community

Michael Hu committed on Jul 27

Commit

f7492cb

1 Parent(s): fafafc3

Refactor presentation layer to use application services

Browse files

Files changed (1) hide show

app.py +279 -112

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 Main entry point for the Audio Translation Web Application
-Handles file upload, processing pipeline, and UI rendering
 """
 import logging
@@ -17,10 +17,17 @@ logger = logging.getLogger(__name__)
 import streamlit as st
 import os
 import time
-import subprocess
-from utils.stt import transcribe_audio
-from utils.translation import translate_text
-from utils.tts import get_tts_engine, generate_speech
 # Initialize environment configurations
 os.makedirs("temp/uploads", exist_ok=True)
@@ -44,162 +51,322 @@ def configure_page():
         </style>
     """, unsafe_allow_html=True)
-def handle_file_processing(upload_path, asr_model="whisper"):
     """
-    Execute the complete processing pipeline:
-    1. Speech-to-Text (STT)
-    2. Machine Translation
-    3. Text-to-Speech (TTS)
     Args:
-        upload_path: Path to the uploaded audio file
-        asr_model: ASR model to use (whisper or parakeet)
     """
-    logger.info(f"Starting processing for: {upload_path} using {asr_model} model")
     progress_bar = st.progress(0)
     status_text = st.empty()
     try:
-        # STT Phase
-        logger.info("Beginning STT processing")
         status_text.markdown("🔍 **Performing Speech Recognition...**")
-        with st.spinner(f"Initializing {asr_model.capitalize()} model..."):
-            english_text = transcribe_audio(upload_path, model_name=asr_model)
-        progress_bar.progress(30)
-        logger.info(f"STT completed. Text length: {len(english_text)} characters")
-        # Translation Phase
-        logger.info("Beginning translation")
-        status_text.markdown("🌐 **Translating Content...**")
-        with st.spinner("Loading translation model..."):
-            chinese_text = translate_text(english_text)
-        progress_bar.progress(60)
-        logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
-        # TTS Phase
-        logger.info("Beginning TTS generation")
-        status_text.markdown("🎵 **Generating Chinese Speech...**")
-        # Initialize TTS engine with appropriate language code for Chinese
-        engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
-        # Generate speech and get the file path
-        output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
-        progress_bar.progress(100)
-        logger.info(f"TTS completed. Output file: {output_path}")
-        # Store the text for streaming playback
-        st.session_state.current_text = chinese_text
-        status_text.success("✅ Processing Complete!")
-        return english_text, chinese_text, output_path
     except Exception as e:
         logger.error(f"Processing failed: {str(e)}", exc_info=True)
         status_text.error(f"❌ Processing Failed: {str(e)}")
         st.exception(e)
-        raise
-def render_results(english_text, chinese_text, output_path):
-    """Display processing results in organized columns"""
     logger.info("Rendering results")
     st.divider()
     col1, col2 = st.columns([2, 1])
     with col1:
-        st.subheader("Recognition Results")
-        st.code(english_text, language="text")
-        st.subheader("Translation Results")
-        st.code(chinese_text, language="text")
     with col2:
-        st.subheader("Audio Output")
-        # Standard audio player for the full file
-        st.audio(output_path)
-        # Download button
-        with open(output_path, "rb") as f:
-            st.download_button(
-                label="Download Audio",
-                data=f,
-                file_name="translated_audio.wav",
-                mime="audio/wav"
-            )
-        # Streaming playback controls
-        st.subheader("Streaming Playback")
-        if st.button("Stream Audio"):
-            engine = get_tts_engine(lang_code='z')
-            streaming_placeholder = st.empty()
-            # Stream the audio in chunks
-            for sample_rate, audio_chunk in engine.generate_speech_stream(
-                chinese_text,
-                voice="zf_xiaobei"
-            ):
-                # Create a temporary file for each chunk
-                temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
-                import soundfile as sf
-                sf.write(temp_chunk_path, audio_chunk, sample_rate)
-                # Play the chunk
-                with streaming_placeholder:
-                    st.audio(temp_chunk_path, sample_rate=sample_rate)
-                # Clean up the temporary chunk file
-                os.remove(temp_chunk_path)
 def initialize_session_state():
     """Initialize session state variables"""
-    if 'current_text' not in st.session_state:
-        st.session_state.current_text = None
 def main():
     """Main application workflow"""
     logger.info("Starting application")
     configure_page()
     initialize_session_state()
     st.title("🎧 High-Quality Audio Translation System")
     st.markdown("Upload English Audio → Get Chinese Speech Output")
     # Voice selection in sidebar
     st.sidebar.header("TTS Settings")
     voice_options = {
-        "Xiaobei (Female)": "zf_xiaobei",
-        "Yunjian (Male)": "zm_yunjian",
     }
-    selected_voice = st.sidebar.selectbox(
         "Select Voice",
         list(voice_options.keys()),
-        format_func=lambda x: x
     )
-    speed = st.sidebar.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
     # Model selection
     asr_model = st.selectbox(
         "Select Speech Recognition Model",
-        options=["parakeet", "whisper"],
         index=0,
         help="Choose the ASR model for speech recognition"
     )
     uploaded_file = st.file_uploader(
-        "Select Audio File (MP3/WAV)",
-        type=["mp3", "wav"],
-        accept_multiple_files=False
     )
     if uploaded_file:
         logger.info(f"File uploaded: {uploaded_file.name}")
-        upload_path = os.path.join("temp/uploads", uploaded_file.name)
-        with open(upload_path, "wb") as f:
-            f.write(uploaded_file.getbuffer())
-        results = handle_file_processing(upload_path, asr_model=asr_model)
-        if results:
-            render_results(*results)
 if __name__ == "__main__":
     main()

 """
 Main entry point for the Audio Translation Web Application
+Handles file upload, processing pipeline, and UI rendering using DDD architecture
 """
 import logging
 import streamlit as st
 import os
 import time
+from typing import Optional
+# Import application services and DTOs
+from src.application.services.audio_processing_service import AudioProcessingApplicationService
+from src.application.services.configuration_service import ConfigurationApplicationService
+from src.application.dtos.audio_upload_dto import AudioUploadDto
+from src.application.dtos.processing_request_dto import ProcessingRequestDto
+from src.application.dtos.processing_result_dto import ProcessingResultDto
+# Import infrastructure setup
+from src.infrastructure.config.container_setup import initialize_global_container, get_global_container
 # Initialize environment configurations
 os.makedirs("temp/uploads", exist_ok=True)
         </style>
     """, unsafe_allow_html=True)
+def create_audio_upload_dto(uploaded_file) -> AudioUploadDto:
+    """
+    Create AudioUploadDto from Streamlit uploaded file.
+    The content type is chosen from the file extension alone (the bytes are
+    not inspected); an extension missing from the map falls back to
+    'audio/wav'.
+    Args:
+        uploaded_file: Streamlit UploadedFile object
+    Returns:
+        AudioUploadDto: DTO containing upload information
+    Raises:
+        ValueError: If reading the upload or building the DTO fails; the
+            underlying exception is chained as the cause.
+    """
+    # Extension -> MIME type; the lookup uses the lower-cased extension
+    content_type_map = {
+        '.wav': 'audio/wav',
+        '.mp3': 'audio/mpeg',
+        '.m4a': 'audio/mp4',
+        '.flac': 'audio/flac',
+        '.ogg': 'audio/ogg',
+    }
+    try:
+        content = uploaded_file.getbuffer().tobytes()
+        file_ext = os.path.splitext(uploaded_file.name.lower())[1]
+        content_type = content_type_map.get(file_ext, 'audio/wav')
+        return AudioUploadDto(
+            filename=uploaded_file.name,
+            content=content,
+            content_type=content_type,
+            size=len(content)
+        )
+    except Exception as e:
+        # Keep the traceback in the log and chain the cause so the original
+        # failure is not hidden behind the generic ValueError
+        logger.error(f"Failed to create AudioUploadDto: {e}", exc_info=True)
+        raise ValueError(f"Invalid audio file: {str(e)}") from e
+def handle_file_processing(
+    audio_upload: AudioUploadDto,
+    asr_model: str,
+    target_language: str,
+    voice: str,
+    speed: float,
+    source_language: Optional[str] = None
+) -> ProcessingResultDto:
     """
+    Execute the complete processing pipeline using application services.
+    Creates a progress bar and a status placeholder on the page, resolves
+    the audio processing service from the global container, and submits one
+    ProcessingRequestDto to it. Exceptions are not propagated: they are
+    logged, shown on the page, and turned into an error result.
     Args:
+        audio_upload: Audio upload DTO
+        asr_model: ASR model to use
+        target_language: Target language for translation
+        voice: Voice for TTS
+        speed: Speech speed
+        source_language: Source language (optional)
+    Returns:
+        ProcessingResultDto: The service's result, or an error result with
+            error_code 'SYSTEM_ERROR' if an exception was raised here
     """
+    logger.info(f"Starting processing for: {audio_upload.filename} using {asr_model} model")
     progress_bar = st.progress(0)
     status_text = st.empty()
     try:
+        # Get application service from container
+        container = get_global_container()
+        audio_service = container.resolve(AudioProcessingApplicationService)
+        # Create processing request
+        request = ProcessingRequestDto(
+            audio=audio_upload,
+            asr_model=asr_model,
+            target_language=target_language,
+            voice=voice,
+            speed=speed,
+            source_language=source_language
+        )
+        # Update progress and status
         status_text.markdown("🔍 **Performing Speech Recognition...**")
+        progress_bar.progress(10)
+        # Process through application service. The whole pipeline is one
+        # call, so the bar only moves from 10 to 100 and the status text is
+        # not updated per stage.
+        with st.spinner("Processing audio pipeline..."):
+            result = audio_service.process_audio_pipeline(request)
+        if result.success:
+            progress_bar.progress(100)
+            status_text.success("✅ Processing Complete!")
+            logger.info(f"Processing completed successfully in {result.processing_time:.2f}s")
+        else:
+            # Failure reported by the service: the bar is left at 10
+            status_text.error(f"❌ Processing Failed: {result.error_message}")
+            logger.error(f"Processing failed: {result.error_message}")
+        return result
     except Exception as e:
         logger.error(f"Processing failed: {str(e)}", exc_info=True)
         status_text.error(f"❌ Processing Failed: {str(e)}")
         st.exception(e)
+        # Return error result
+        return ProcessingResultDto.error_result(
+            error_message=str(e),
+            error_code='SYSTEM_ERROR'
+        )
+def render_results(result: ProcessingResultDto):
+    """
+    Display processing results using ProcessingResultDto.
+    A failed result is shown as an error (plus the error code, if any) and
+    nothing else is rendered. A successful result is laid out in two
+    columns: recognised text, translated text and metadata on the left;
+    audio player, download button and processing time on the right.
+    Args:
+        result: Processing result DTO
+    """
     logger.info("Rendering results")
     st.divider()
+    if not result.success:
+        st.error(f"Processing failed: {result.error_message}")
+        if result.error_code:
+            st.code(f"Error Code: {result.error_code}")
+        return
     col1, col2 = st.columns([2, 1])
     with col1:
+        # Display original text if available
+        if result.original_text:
+            st.subheader("Recognition Results")
+            st.code(result.original_text, language="text")
+        # Display translated text if available
+        if result.translated_text:
+            st.subheader("Translation Results")
+            st.code(result.translated_text, language="text")
+        # Display processing metadata
+        if result.metadata:
+            with st.expander("Processing Details"):
+                st.json(result.metadata)
     with col2:
+        # Display audio output if available
+        if result.has_audio_output and result.audio_path:
+            st.subheader("Audio Output")
+            # Check that the path exists before handing it to the player
+            if os.path.exists(result.audio_path):
+                # Standard audio player
+                st.audio(result.audio_path)
+                # Download button; opening the file can still fail after the
+                # existence check, hence the try/except around it
+                try:
+                    with open(result.audio_path, "rb") as f:
+                        # NOTE(review): name and MIME type are fixed to WAV
+                        # whatever extension audio_path has - confirm the
+                        # pipeline always produces WAV output
+                        st.download_button(
+                            label="Download Audio",
+                            data=f,
+                            file_name="translated_audio.wav",
+                            mime="audio/wav"
+                        )
+                except Exception as e:
+                    st.warning(f"Download not available: {str(e)}")
+            else:
+                st.warning("Audio file not found or not accessible")
+        # Display processing time (shown whether or not audio was produced)
+        st.metric("Processing Time", f"{result.processing_time:.2f}s")
+def get_supported_configurations() -> dict:
+    """
+    Fetch the option set the UI offers (ASR models, voices, languages,
+    audio formats, size limit, speed range).
+    The values come from the audio processing application service. If
+    resolving or calling the service raises, a warning is logged and a
+    built-in default set is returned instead.
+    Returns:
+        dict: Supported configurations
+    """
+    try:
+        service = get_global_container().resolve(AudioProcessingApplicationService)
+        return service.get_supported_configurations()
+    except Exception as e:
+        logger.warning(f"Failed to get configurations: {e}")
+    # Reached only when the lookup above failed: hard-coded defaults
+    fallback = {
+        'asr_models': ['whisper-small', 'parakeet'],
+        'voices': ['kokoro', 'dia', 'cosyvoice2', 'dummy'],
+        'languages': ['en', 'zh', 'es', 'fr', 'de'],
+        'audio_formats': ['wav', 'mp3'],
+        'max_file_size_mb': 100,
+        'speed_range': {'min': 0.5, 'max': 2.0}
+    }
+    return fallback
 def initialize_session_state():
     """Initialize session state variables"""
+    # Seed only the keys that are not already present, so existing values
+    # are left untouched
+    defaults = {
+        'processing_result': None,
+        'container_initialized': False,
+    }
+    for key, value in defaults.items():
+        if key not in st.session_state:
+            st.session_state[key] = value
+def initialize_application():
+    """
+    Initialize the application with dependency injection container.
+    Container setup is skipped when the 'container_initialized' session
+    flag is already set. The flag is read with a default so this function
+    does not depend on initialize_session_state() having run first; plain
+    attribute access on a missing session-state key raises.
+    If setup fails, the error is logged, shown on the page, and the script
+    run is stopped.
+    """
+    if st.session_state.get('container_initialized', False):
+        return
+    try:
+        logger.info("Initializing application container")
+        initialize_global_container()
+        st.session_state.container_initialized = True
+        logger.info("Application container initialized successfully")
+    except Exception as e:
+        # Include the traceback: this is the only record of why startup failed
+        logger.error(f"Failed to initialize application: {e}", exc_info=True)
+        st.error(f"Application initialization failed: {str(e)}")
+        st.stop()
 def main():
     """Main application workflow"""
     logger.info("Starting application")
+    # Configure page
     configure_page()
     initialize_session_state()
+    # Initialize application only after the session state has been seeded:
+    # initialize_application() reads st.session_state.container_initialized,
+    # and attribute access on a missing session-state key raises
+    initialize_application()
     st.title("🎧 High-Quality Audio Translation System")
     st.markdown("Upload English Audio → Get Chinese Speech Output")
+    # Get supported configurations
+    config = get_supported_configurations()
     # Voice selection in sidebar
     st.sidebar.header("TTS Settings")
+    # Map voice display names to internal IDs
     voice_options = {
+        "Kokoro": "kokoro",
+        "Dia": "dia",
+        "CosyVoice2": "cosyvoice2",
+        "Dummy (Test)": "dummy"
     }
+    selected_voice_display = st.sidebar.selectbox(
         "Select Voice",
         list(voice_options.keys()),
+        index=0
+    )
+    selected_voice = voice_options[selected_voice_display]
+    speed = st.sidebar.slider(
+        "Speech Speed",
+        config['speed_range']['min'],
+        config['speed_range']['max'],
+        1.0,
+        0.1
     )
     # Model selection
     asr_model = st.selectbox(
         "Select Speech Recognition Model",
+        options=config['asr_models'],
         index=0,
         help="Choose the ASR model for speech recognition"
     )
+    # Language selection
+    language_options = {
+        "Chinese (Mandarin)": "zh",
+        "Spanish": "es",
+        "French": "fr",
+        "German": "de",
+        "English": "en"
+    }
+    selected_language_display = st.selectbox(
+        "Target Language",
+        list(language_options.keys()),
+        index=0,
+        help="Select the target language for translation"
+    )
+    target_language = language_options[selected_language_display]
+    # File upload
     uploaded_file = st.file_uploader(
+        f"Select Audio File ({', '.join(config['audio_formats']).upper()})",
+        type=config['audio_formats'],
+        accept_multiple_files=False,
+        help=f"Maximum file size: {config['max_file_size_mb']}MB"
     )
     if uploaded_file:
         logger.info(f"File uploaded: {uploaded_file.name}")
+        try:
+            # Create audio upload DTO
+            audio_upload = create_audio_upload_dto(uploaded_file)
+            # Display file information
+            st.info(f"📁 **File:** {audio_upload.filename} ({audio_upload.size / 1024:.1f} KB)")
+            # Drop a result left over from a previously uploaded file so it
+            # is not rendered next to the new one. Name + size is used as a
+            # cheap identity for the upload.
+            upload_key = (audio_upload.filename, audio_upload.size)
+            if st.session_state.get('processing_result_key') != upload_key:
+                st.session_state.processing_result = None
+                st.session_state.processing_result_key = upload_key
+            # Process button
+            if st.button("🚀 Process Audio", type="primary"):
+                # Process the audio
+                result = handle_file_processing(
+                    audio_upload=audio_upload,
+                    asr_model=asr_model,
+                    target_language=target_language,
+                    voice=selected_voice,
+                    speed=speed,
+                    source_language="en"  # Assume English source for now
+                )
+                # Store result in session state so it is still shown on
+                # reruns where the button is not pressed
+                st.session_state.processing_result = result
+            # Display results if available
+            if st.session_state.processing_result:
+                render_results(st.session_state.processing_result)
+        except Exception as e:
+            st.error(f"Error processing file: {str(e)}")
+            logger.error(f"File processing error: {e}", exc_info=True)
 if __name__ == "__main__":
     main()