# tabs/audio_transcription_tab.py - Audio Transcription Tab Component
import asyncio
import json

import gradio as gr

from utils.audio_utils import load_audio_info, format_time
from utils.transcription_utils import transcribe


def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates"""
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    audio_data, sample_rate, duration = load_audio_info(audio_file)

    if audio_data is None:
        return "❌ Could not read audio file", "File error"

    duration_text = f"πŸ“ File duration: {format_time(duration)} ({duration:.1f} seconds)"
    status_text = f"🎡 Sample rate: {sample_rate:,} Hz | Ready for transcription"

    return duration_text, status_text


def format_transcription_segments(segments):
    """Format transcription segments with timestamps"""
    if not segments:
        return "No segments found"

    formatted_text = ""
    for i, segment in enumerate(segments):
        start_time = segment.get('start', 0)
        end_time = segment.get('end', 0)
        text = segment.get('text', '').strip()

        if text:
            formatted_text += f"**[{format_time(start_time)} - {format_time(end_time)}]**\n"
            formatted_text += f"{text}\n\n"

    return formatted_text


def format_word_level_transcription(segments):
    """Format word-level transcription with confidence scores"""
    if not segments:
        return "No word-level data available"

    formatted_text = ""
    for segment in segments:
        words = segment.get('words', [])
        if words:
            for word in words:
                word_text = word.get('word', '')
                confidence = word.get('score', 0)
                start_time = word.get('start', 0)

                # Color code based on confidence
                if confidence > 0.9:
                    color = "green"
                elif confidence > 0.7:
                    color = "orange"
                else:
                    color = "red"

                formatted_text += f'<span style="color: {color}; font-weight: bold;" title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">{word_text}</span> '
            formatted_text += "\n\n"

    return formatted_text


def format_json_for_display(transcription_data):
    """Format transcription data as pretty JSON string"""
    return json.dumps(transcription_data, indent=2, ensure_ascii=False)


async def process_transcription(audio_file):
    """Process audio transcription"""
    if audio_file is None:
        return "Please upload an audio file first.", "", "", ""

    try:
        # Read audio file as bytes
        with open(audio_file, 'rb') as f:
            audio_bytes = f.read()

        # Call transcription API
        transcription_result = await transcribe(audio_bytes)

        # Extract information
        full_text = transcription_result.get('full_text', '')
        segments = transcription_result.get('segments', [])
        language = transcription_result.get('language_detected', 'Unknown')
        processing_time = transcription_result.get('processing_time_seconds', 0)

        # Format results
        status = f"βœ… Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"

        # Create formatted outputs
        segments_formatted = format_transcription_segments(segments)

        # Format JSON for display
        json_formatted = format_json_for_display(transcription_result)

        return status, full_text, segments_formatted, json_formatted

    except Exception as e:
        return f"❌ Error during transcription: {str(e)}", "", "", ""


def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    This function provides a synchronous wrapper around the async transcription process,
    converting audio files to text using advanced speech recognition. It handles the
    async/await complexity internally and returns detailed transcription results including
    the full text, timestamped segments, language detection, and processing statistics.

    Args:
        audio_file (str): Path to the input audio file to be transcribed
                         (supports MP3, WAV, M4A, FLAC, OGG, and other common audio formats)

    Returns:
        tuple: A tuple containing four string elements:
            - status (str): Status message indicating success with language and processing time,
              or error information if transcription failed
            - full_text (str): Complete transcription as plain text, or empty string on error
            - segments_formatted (str): Formatted text showing timestamped segments with
              start/end times, or empty string on error
            - json_formatted (str): Pretty-formatted JSON string containing complete transcription
              data including word-level timestamps and metadata, or empty string on error.
              The JSON structure includes:
              * "filename": original audio filename
              * "language_detected": detected language code (e.g., "en", "es", "fr")
              * "full_text": complete transcription text
              * "segments": array of text segments with timing and word breakdowns
              * "processing_time_seconds": time taken for transcription
              Each segment contains: start/end times, text, and words array with individual
              word timestamps and confidence scores (0.0-1.0 range)
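              Illustrative shape (field values are hypothetical, for reference only):
                  {"filename": "audio.mp3",
                   "language_detected": "en",
                   "full_text": "Hello world",
                   "segments": [{"start": 0.0, "end": 1.2, "text": "Hello world",
                                 "words": [{"word": "Hello", "start": 0.0, "score": 0.98}]}],
                   "processing_time_seconds": 3.4}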

    Example:
        status, text, segments, json_data = transcribe_audio_sync("path/to/audio.mp3")
        if "βœ…" in status:
            print(f"Success: {status}")
            print(f"Transcription: {text}")
            print(f"Segments: {segments}")
        else:
            print(f"Error: {status}")

    Note:
        - Automatically detects language in the audio file
        - Provides word-level and segment-level timestamps for precise audio editing
        - Returns confidence scores for quality assessment
        - Handles various audio formats and sample rates automatically
        - Processing time depends on audio length and complexity
        - All timestamps are provided in seconds with decimal precision
        - Function blocks until transcription is complete (synchronous)
        - For async usage, use process_transcription() directly instead
    """
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(process_transcription(audio_file))
        finally:
            # Ensure the loop is always closed and unset, even when
            # transcription raises, so repeated calls do not leak loops.
            asyncio.set_event_loop(None)
            loop.close()
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", ""


def create_audio_transcription_tab():
    """Create the audio transcription tab interface"""

    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )

    with gr.Row():
        with gr.Column(scale=2):
            # File upload
            audio_input = gr.Audio(
                label="πŸ“€ Upload Audio File",
                type="filepath"
            )

            # Audio info
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")

            # Transcribe button
            transcribe_btn = gr.Button("🎀 Start Transcription", variant="primary", size="lg")

            # Status message
            status_msg = gr.Markdown("")

    # Results section
    with gr.Row():
        with gr.Column():
            # Full transcription
            full_text_output = gr.Textbox(
                label="πŸ“ Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )

        with gr.Column():
            # Segmented transcription with timestamps
            segments_output = gr.Markdown(
                label="⏱️ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )

    # JSON Results section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### πŸ“„ JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )

    # Event handlers
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )

    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )
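
    # Note: the synchronous wrapper is used for the click handler; Gradio also
    # accepts async callables, so process_transcription could be wired directly
    # if a non-blocking handler is preferred.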

    # Usage tips
    with gr.Accordion("πŸ“‹ Transcription Guide", open=False):
        gr.Markdown("""
        **🎀 Supported Features:**
        - **Multiple Languages**: Automatic language detection
        - **High Accuracy**: Professional-grade transcription
        - **Word Timestamps**: Precise timing for each word
        - **Confidence Scores**: Quality indicators for each word
        - **JSON Output**: Complete structured data

        **πŸ“ File Requirements:**
        - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
        - **Duration**: Best results with files under 10 minutes
        - **Quality**: Clear audio produces more accurate transcriptions

        **πŸ’‘ Tips:**
        - Use high-quality audio for best results
        - Consider splitting long files into segments
        - Copy JSON data using the copy button for easy access
        - JSON contains all metadata including word-level timestamps

        **πŸ“Š JSON Structure:**
        - **full_text**: Complete transcription text
        - **segments**: Timestamped text segments
        - **language_detected**: Detected language code
        - **processing_time_seconds**: API processing duration
        """)