Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

App Files Community

Michael Hu committed on Apr 27

Commit

c549dab

1 Parent(s): 5b27125

Revert "Update README.md"

Browse files

This reverts commit 5b2712563bbfa23c72e8ae22c408a748ad20238c.

Files changed (5) hide show

README.md +3 -4
app.py +128 -172
app_gradio.py +0 -237
requirements.txt +2 -5
utils/tts_dia.py +1 -2

README.md CHANGED Viewed

@@ -3,11 +3,10 @@ title: TeachingAssistant
 emoji: 🚀
 colorFrom: gray
 colorTo: blue
-sdk: gradio
-sdk_version: 5.27.0
 app_file: app.py
 pinned: false
-license: apache-2.0
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 emoji: 🚀
 colorFrom: gray
 colorTo: blue
+sdk: streamlit
+sdk_version: 1.41.1
 app_file: app.py
 pinned: false
 ---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
-"""Main entry point for the Audio Translation Web Application using Gradio
 Handles file upload, processing pipeline, and UI rendering
 """
@@ -13,225 +14,180 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
-import gradio as gr
 import os
 import time
-import numpy as np
-import soundfile as sf
 from utils.stt import transcribe_audio
 from utils.translation import translate_text
-from utils.tts import get_tts_engine
 # Initialize environment configurations
 os.makedirs("temp/uploads", exist_ok=True)
 os.makedirs("temp/outputs", exist_ok=True)
-# CSS for styling the Gradio interface
-css = """
-.gradio-container {
-    max-width: 1200px;
-    margin: 0 auto;
-}
-.output-text {
-    font-family: monospace;
-    padding: 10px;
-    background-color: #f5f5f5;
-    border-radius: 4px;
-}
-"""
-def handle_file_processing(audio_file):
     """
     Execute the complete processing pipeline:
     1. Speech-to-Text (STT)
     2. Machine Translation
     3. Text-to-Speech (TTS)
-    Args:
-        audio_file: Tuple containing (sample_rate, audio_data)
-    Returns:
-        Tuple containing (english_text, chinese_text, output_audio)
     """
-    logger.info("Starting processing for uploaded audio")
     try:
-        # Save the uploaded audio to a temporary file
-        sr, audio_data = audio_file
-        temp_path = os.path.join("temp/uploads", f"upload_{time.time()}.wav")
-        sf.write(temp_path, audio_data, sr)
-        logger.info(f"Saved uploaded audio to {temp_path}")
         # STT Phase
         logger.info("Beginning STT processing")
-        english_text = transcribe_audio(temp_path)
         logger.info(f"STT completed. Text length: {len(english_text)} characters")
         # Translation Phase
         logger.info("Beginning translation")
-        chinese_text = translate_text(english_text)
         logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
         # TTS Phase
         logger.info("Beginning TTS generation")
         # Initialize TTS engine with appropriate language code for Chinese
         engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
         # Generate speech and get the file path
         output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
         logger.info(f"TTS completed. Output file: {output_path}")
-        # Load the generated audio for Gradio output
-        audio_data, sr = sf.read(output_path)
-        return english_text, chinese_text, (sr, audio_data)
     except Exception as e:
         logger.error(f"Processing failed: {str(e)}", exc_info=True)
-        raise gr.Error(f"Processing Failed: {str(e)}")
-def stream_audio(chinese_text, voice, speed):
-    """
-    Stream audio in chunks for the Gradio interface
-    Args:
-        chinese_text: The Chinese text to convert to speech
-        voice: The voice to use
-        speed: The speech speed factor
-    Returns:
-        Generator yielding audio chunks
-    """
-    engine = get_tts_engine(lang_code='z')
-    # Stream the audio in chunks
-    for sample_rate, audio_chunk in engine.generate_speech_stream(
-        chinese_text,
-        voice=voice,
-        speed=speed
-    ):
-        # Create a temporary file for each chunk
-        temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
-        sf.write(temp_chunk_path, audio_chunk, sample_rate)
-        # Load the chunk for Gradio output
-        chunk_data, sr = sf.read(temp_chunk_path)
-        # Clean up the temporary chunk file
-        os.remove(temp_chunk_path)
-        yield (sr, chunk_data)
-def create_interface():
-    """
-    Create and configure the Gradio interface
-    Returns:
-        Gradio Blocks interface
-    """
-    with gr.Blocks(css=css) as interface:
-        gr.Markdown("# 🎧 High-Quality Audio Translation System")
-        gr.Markdown("Upload English Audio → Get Chinese Speech Output")
-        with gr.Row():
-            with gr.Column(scale=2):
-                # File upload component
-                audio_input = gr.Audio(
-                    label="Upload English Audio",
-                    type="numpy",
-                    sources=["upload", "microphone"]
-                )
-                # Process button
-                process_btn = gr.Button("Process Audio", variant="primary")
-            with gr.Column(scale=1):
-                # TTS Settings
-                # Changed from gr.Box() to direct placement in column
-                gr.Markdown("### TTS Settings")
-                voice_dropdown = gr.Dropdown(
-                    choices=["Xiaobei (Female)", "Yunjian (Male)"],
-                    value="Xiaobei (Female)",
-                    label="Select Voice"
-                )
-                speed_slider = gr.Slider(
-                    minimum=0.5,
-                    maximum=2.0,
-                    value=1.0,
-                    step=0.1,
-                    label="Speech Speed"
-                )
-        # Output section
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Text outputs
-                english_output = gr.Textbox(
-                    label="Recognition Results",
-                    lines=5,
-                    elem_classes=["output-text"]
-                )
-                chinese_output = gr.Textbox(
-                    label="Translation Results",
-                    lines=5,
-                    elem_classes=["output-text"]
-                )
-            with gr.Column(scale=1):
-                # Audio output
-                audio_output = gr.Audio(
-                    label="Audio Output",
-                    type="numpy"
-                )
-                # Stream button
-                stream_btn = gr.Button("Stream Audio")
-                # Download button is automatically provided by gr.Audio
-        # Set up event handlers
-        process_btn.click(
-            fn=handle_file_processing,
-            inputs=[audio_input],
-            outputs=[english_output, chinese_output, audio_output]
-        )
-        # Map voice selection to actual voice IDs
-        def get_voice_id(voice_name):
-            voice_map = {
-                "Xiaobei (Female)": "zf_xiaobei",
-                "Yunjian (Male)": "zm_yunjian"
-            }
-            return voice_map.get(voice_name, "zf_xiaobei")
-        # Stream button handler
-        stream_btn.click(
-            fn=lambda text, voice, speed: stream_audio(text, get_voice_id(voice), speed),
-            inputs=[chinese_output, voice_dropdown, speed_slider],
-            outputs=audio_output
-        )
-        # Examples
-        gr.Examples(
-            examples=[
-                ["examples/sample1.mp3"],
-                ["examples/sample2.wav"]
-            ],
-            inputs=audio_input
-        )
-    return interface
 def main():
-    """
-    Main application entry point
-    """
-    logger.info("Starting Gradio application")
-    interface = create_interface()
-    interface.launch()
 if __name__ == "__main__":
     main()

+"""
+Main entry point for the Audio Translation Web Application
 Handles file upload, processing pipeline, and UI rendering
 """
 )
 logger = logging.getLogger(__name__)
+import streamlit as st
 import os
 import time
+import subprocess
 from utils.stt import transcribe_audio
 from utils.translation import translate_text
+from utils.tts import get_tts_engine, generate_speech
 # Initialize environment configurations
 os.makedirs("temp/uploads", exist_ok=True)
 os.makedirs("temp/outputs", exist_ok=True)
+def configure_page():
+    """Set up Streamlit page configuration"""
+    logger.info("Configuring Streamlit page")
+    st.set_page_config(
+        page_title="Audio Translator",
+        page_icon="🎧",
+        layout="wide",
+        initial_sidebar_state="expanded"
+    )
+    st.markdown("""
+        <style>
+            .reportview-container {margin-top: -2em;}
+            #MainMenu {visibility: hidden;}
+            .stDeployButton {display:none;}
+            .stAlert {padding: 20px !important;}
+        </style>
+    """, unsafe_allow_html=True)
+def handle_file_processing(upload_path):
     """
     Execute the complete processing pipeline:
     1. Speech-to-Text (STT)
     2. Machine Translation
     3. Text-to-Speech (TTS)
     """
+    logger.info(f"Starting processing for: {upload_path}")
+    progress_bar = st.progress(0)
+    status_text = st.empty()
     try:
         # STT Phase
         logger.info("Beginning STT processing")
+        status_text.markdown("🔍 **Performing Speech Recognition...**")
+        with st.spinner("Initializing Whisper model..."):
+            english_text = transcribe_audio(upload_path)
+        progress_bar.progress(30)
         logger.info(f"STT completed. Text length: {len(english_text)} characters")
         # Translation Phase
         logger.info("Beginning translation")
+        status_text.markdown("🌐 **Translating Content...**")
+        with st.spinner("Loading translation model..."):
+            chinese_text = translate_text(english_text)
+        progress_bar.progress(60)
         logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
         # TTS Phase
         logger.info("Beginning TTS generation")
+        status_text.markdown("🎵 **Generating Chinese Speech...**")
         # Initialize TTS engine with appropriate language code for Chinese
         engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
         # Generate speech and get the file path
         output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
+        progress_bar.progress(100)
         logger.info(f"TTS completed. Output file: {output_path}")
+        # Store the text for streaming playback
+        st.session_state.current_text = chinese_text
+        status_text.success("✅ Processing Complete!")
+        return english_text, chinese_text, output_path
     except Exception as e:
         logger.error(f"Processing failed: {str(e)}", exc_info=True)
+        status_text.error(f"❌ Processing Failed: {str(e)}")
+        st.exception(e)
+        raise
+def render_results(english_text, chinese_text, output_path):
+    """Display processing results in organized columns"""
+    logger.info("Rendering results")
+    st.divider()
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        st.subheader("Recognition Results")
+        st.code(english_text, language="text")
+        st.subheader("Translation Results")
+        st.code(chinese_text, language="text")
+    with col2:
+        st.subheader("Audio Output")
+        # Standard audio player for the full file
+        st.audio(output_path)
+        # Download button
+        with open(output_path, "rb") as f:
+            st.download_button(
+                label="Download Audio",
+                data=f,
+                file_name="translated_audio.wav",
+                mime="audio/wav"
+            )
+        # Streaming playback controls
+        st.subheader("Streaming Playback")
+        if st.button("Stream Audio"):
+            engine = get_tts_engine(lang_code='z')
+            streaming_placeholder = st.empty()
+            # Stream the audio in chunks
+            for sample_rate, audio_chunk in engine.generate_speech_stream(
+                chinese_text,
+                voice="zf_xiaobei"
+            ):
+                # Create a temporary file for each chunk
+                temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
+                import soundfile as sf
+                sf.write(temp_chunk_path, audio_chunk, sample_rate)
+                # Play the chunk
+                with streaming_placeholder:
+                    st.audio(temp_chunk_path, sample_rate=sample_rate)
+                # Clean up the temporary chunk file
+                os.remove(temp_chunk_path)
+def initialize_session_state():
+    """Initialize session state variables"""
+    if 'current_text' not in st.session_state:
+        st.session_state.current_text = None
 def main():
+    """Main application workflow"""
+    logger.info("Starting application")
+    configure_page()
+    initialize_session_state()
+    st.title("🎧 High-Quality Audio Translation System")
+    st.markdown("Upload English Audio → Get Chinese Speech Output")
+    # Voice selection in sidebar
+    st.sidebar.header("TTS Settings")
+    voice_options = {
+        "Xiaobei (Female)": "zf_xiaobei",
+        "Yunjian (Male)": "zm_yunjian",
+    }
+    selected_voice = st.sidebar.selectbox(
+        "Select Voice",
+        list(voice_options.keys()),
+        format_func=lambda x: x
+    )
+    speed = st.sidebar.slider("Speech Speed", 0.5, 2.0, 1.0, 0.1)
+    uploaded_file = st.file_uploader(
+        "Select Audio File (MP3/WAV)",
+        type=["mp3", "wav"],
+        accept_multiple_files=False
+    )
+    if uploaded_file:
+        logger.info(f"File uploaded: {uploaded_file.name}")
+        upload_path = os.path.join("temp/uploads", uploaded_file.name)
+        with open(upload_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        results = handle_file_processing(upload_path)
+        if results:
+            render_results(*results)
 if __name__ == "__main__":
     main()

app_gradio.py DELETED Viewed

@@ -1,237 +0,0 @@
-"""Main entry point for the Audio Translation Web Application using Gradio
-Handles file upload, processing pipeline, and UI rendering
-"""
-import logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-    handlers=[
-        logging.FileHandler("app.log"),
-        logging.StreamHandler()
-    ]
-)
-logger = logging.getLogger(__name__)
-import gradio as gr
-import os
-import time
-import numpy as np
-import soundfile as sf
-from utils.stt import transcribe_audio
-from utils.translation import translate_text
-from utils.tts import get_tts_engine, generate_speech
-# Initialize environment configurations
-os.makedirs("temp/uploads", exist_ok=True)
-os.makedirs("temp/outputs", exist_ok=True)
-# CSS for styling the Gradio interface
-css = """
-.gradio-container {
-    max-width: 1200px;
-    margin: 0 auto;
-}
-.output-text {
-    font-family: monospace;
-    padding: 10px;
-    background-color: #f5f5f5;
-    border-radius: 4px;
-}
-"""
-def handle_file_processing(audio_file):
-    """
-    Execute the complete processing pipeline:
-    1. Speech-to-Text (STT)
-    2. Machine Translation
-    3. Text-to-Speech (TTS)
-    Args:
-        audio_file: Tuple containing (sample_rate, audio_data)
-    Returns:
-        Tuple containing (english_text, chinese_text, output_audio)
-    """
-    logger.info("Starting processing for uploaded audio")
-    try:
-        # Save the uploaded audio to a temporary file
-        sr, audio_data = audio_file
-        temp_path = os.path.join("temp/uploads", f"upload_{time.time()}.wav")
-        sf.write(temp_path, audio_data, sr)
-        logger.info(f"Saved uploaded audio to {temp_path}")
-        # STT Phase
-        logger.info("Beginning STT processing")
-        english_text = transcribe_audio(temp_path)
-        logger.info(f"STT completed. Text length: {len(english_text)} characters")
-        # Translation Phase
-        logger.info("Beginning translation")
-        chinese_text = translate_text(english_text)
-        logger.info(f"Translation completed. Translated length: {len(chinese_text)} characters")
-        # TTS Phase
-        logger.info("Beginning TTS generation")
-        # Initialize TTS engine with appropriate language code for Chinese
-        engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese
-        # Generate speech and get the file path
-        output_path = engine.generate_speech(chinese_text, voice="zf_xiaobei")
-        logger.info(f"TTS completed. Output file: {output_path}")
-        # Load the generated audio for Gradio output
-        audio_data, sr = sf.read(output_path)
-        return english_text, chinese_text, (sr, audio_data)
-    except Exception as e:
-        logger.error(f"Processing failed: {str(e)}", exc_info=True)
-        raise gr.Error(f"Processing Failed: {str(e)}")
-def stream_audio(chinese_text, voice, speed):
-    """
-    Stream audio in chunks for the Gradio interface
-    Args:
-        chinese_text: The Chinese text to convert to speech
-        voice: The voice to use
-        speed: The speech speed factor
-    Returns:
-        Generator yielding audio chunks
-    """
-    engine = get_tts_engine(lang_code='z')
-    # Stream the audio in chunks
-    for sample_rate, audio_chunk in engine.generate_speech_stream(
-        chinese_text,
-        voice=voice,
-        speed=speed
-    ):
-        # Create a temporary file for each chunk
-        temp_chunk_path = f"temp/outputs/chunk_{time.time()}.wav"
-        sf.write(temp_chunk_path, audio_chunk, sample_rate)
-        # Load the chunk for Gradio output
-        chunk_data, sr = sf.read(temp_chunk_path)
-        # Clean up the temporary chunk file
-        os.remove(temp_chunk_path)
-        yield (sr, chunk_data)
-def create_interface():
-    """
-    Create and configure the Gradio interface
-    Returns:
-        Gradio Blocks interface
-    """
-    with gr.Blocks(css=css) as interface:
-        gr.Markdown("# 🎧 High-Quality Audio Translation System")
-        gr.Markdown("Upload English Audio → Get Chinese Speech Output")
-        with gr.Row():
-            with gr.Column(scale=2):
-                # File upload component
-                audio_input = gr.Audio(
-                    label="Upload English Audio",
-                    type="numpy",
-                    sources=["upload", "microphone"]
-                )
-                # Process button
-                process_btn = gr.Button("Process Audio", variant="primary")
-            with gr.Column(scale=1):
-                # TTS Settings
-                with gr.Box():
-                    gr.Markdown("### TTS Settings")
-                    voice_dropdown = gr.Dropdown(
-                        choices=["Xiaobei (Female)", "Yunjian (Male)"],
-                        value="Xiaobei (Female)",
-                        label="Select Voice"
-                    )
-                    speed_slider = gr.Slider(
-                        minimum=0.5,
-                        maximum=2.0,
-                        value=1.0,
-                        step=0.1,
-                        label="Speech Speed"
-                    )
-        # Output section
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Text outputs
-                english_output = gr.Textbox(
-                    label="Recognition Results",
-                    lines=5,
-                    elem_classes=["output-text"]
-                )
-                chinese_output = gr.Textbox(
-                    label="Translation Results",
-                    lines=5,
-                    elem_classes=["output-text"]
-                )
-            with gr.Column(scale=1):
-                # Audio output
-                audio_output = gr.Audio(
-                    label="Audio Output",
-                    type="numpy"
-                )
-                # Stream button
-                stream_btn = gr.Button("Stream Audio")
-                # Download button is automatically provided by gr.Audio
-        # Set up event handlers
-        process_btn.click(
-            fn=handle_file_processing,
-            inputs=[audio_input],
-            outputs=[english_output, chinese_output, audio_output]
-        )
-        # Map voice selection to actual voice IDs
-        def get_voice_id(voice_name):
-            voice_map = {
-                "Xiaobei (Female)": "zf_xiaobei",
-                "Yunjian (Male)": "zm_yunjian"
-            }
-            return voice_map.get(voice_name, "zf_xiaobei")
-        # Stream button handler
-        stream_btn.click(
-            fn=lambda text, voice, speed: stream_audio(text, get_voice_id(voice), speed),
-            inputs=[chinese_output, voice_dropdown, speed_slider],
-            outputs=audio_output
-        )
-        # Examples
-        gr.Examples(
-            examples=[
-                ["examples/sample1.mp3"],
-                ["examples/sample2.wav"]
-            ],
-            inputs=audio_input
-        )
-    return interface
-def main():
-    """
-    Main application entry point
-    """
-    logger.info("Starting Gradio application")
-    interface = create_interface()
-    interface.launch()
-if __name__ == "__main__":
-    main()

requirements.txt CHANGED Viewed

@@ -8,11 +8,8 @@ torchaudio>=2.1.0
 scipy>=1.11
 munch>=2.5
 accelerate>=1.2.0
-soundfile>=0.13.1
 kokoro>=0.9.4
 ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
-descript-audio-codec
-gradio>=5.25.2
-gradio-dialogue>=0.0.4
-huggingface-hub>=0.30.2

 scipy>=1.11
 munch>=2.5
 accelerate>=1.2.0
+soundfile>=0.13.0
 kokoro>=0.9.4
 ordered-set>=4.1.0
 phonemizer-fork>=3.3.2
+descript-audio-codec

utils/tts_dia.py CHANGED Viewed

@@ -6,7 +6,6 @@ import numpy as np
 import soundfile as sf
 from pathlib import Path
 from typing import Optional
-import spaces
 from dia.model import Dia
@@ -65,7 +64,7 @@ def _get_model() -> Dia:
             raise
     return _model
-@spaces.GPU
 def generate_speech(text: str, language: str = "zh") -> str:
     """Public interface for TTS generation using Dia model

 import soundfile as sf
 from pathlib import Path
 from typing import Optional
 from dia.model import Dia
             raise
     return _model
 def generate_speech(text: str, language: str = "zh") -> str:
     """Public interface for TTS generation using Dia model