# tabs/audio_transcription_tab.py - Audio Transcription Tab Component
import asyncio
import json

import gradio as gr

from utils.audio_utils import load_audio_info, format_time
from utils.transcription_utils import transcribe


def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates

    Args:
        audio_file: Path of the uploaded audio file, or None when nothing is
            uploaded.

    Returns:
        tuple: (duration_text, status_text) Markdown strings for the two info
            labels shown under the upload widget.
    """
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    audio_data, sample_rate, duration = load_audio_info(audio_file)

    # load_audio_info signals an unreadable file by returning None as the data.
    if audio_data is None:
        return "❌ Could not read audio file", "File error"

    duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
    status_text = f"🎵 Sample rate: {sample_rate:,} Hz | Ready for transcription"

    return duration_text, status_text


def format_transcription_segments(segments):
    """Format transcription segments with timestamps.

    Args:
        segments: Iterable of segment dicts; the keys read are 'start',
            'end' and 'text'.

    Returns:
        str: One Markdown paragraph per non-empty segment, in the form
            "**[start - end]**" followed by the segment text, or
            "No segments found" when ``segments`` is empty or None.
    """
    if not segments:
        return "No segments found"

    parts = []
    for segment in segments:
        start_time = segment.get('start', 0)
        end_time = segment.get('end', 0)
        # Tolerate an explicit None text instead of failing on .strip().
        text = (segment.get('text', '') or '').strip()

        # Segments without any text are skipped entirely.
        if text:
            parts.append(
                f"**[{format_time(start_time)} - {format_time(end_time)}]**\n"
            )
            parts.append(f"{text}\n\n")

    return "".join(parts)


def format_word_level_transcription(segments):
    """Format word-level transcription as plain text, one paragraph per segment.

    NOTE(review): an earlier version computed a confidence-based colour for
    each word but never emitted it; that dead computation was removed. The
    output contains only the word texts -- confirm whether coloured output was
    intended.

    Args:
        segments: Iterable of segment dicts; each may carry a 'words' list of
            dicts with a 'word' key.

    Returns:
        str: For every segment that has words, the words each followed by a
            single space, then a blank line; or
            "No word-level data available" when ``segments`` is empty or None.
    """
    if not segments:
        return "No word-level data available"

    paragraphs = []
    for segment in segments:
        words = segment.get('words', [])
        if words:
            # Every word keeps its trailing space, including the last one.
            line = "".join(f"{word.get('word', '')} " for word in words)
            paragraphs.append(f"{line}\n\n")

    return "".join(paragraphs)


def format_json_for_display(transcription_data):
    """Format transcription data as pretty JSON string.

    Args:
        transcription_data: JSON-serialisable transcription result.

    Returns:
        str: JSON indented by two spaces, with non-ASCII characters kept
            as-is rather than escaped.
    """
    return json.dumps(transcription_data, indent=2, ensure_ascii=False)


async def process_transcription(audio_file):
    """Process audio transcription.

    Reads the file at ``audio_file`` as bytes, awaits ``transcribe`` on them
    and formats the result for display.

    Args:
        audio_file: Path of the audio file to transcribe, or None.

    Returns:
        tuple: (status, full_text, segments_formatted, json_formatted). On
            missing input or any error, ``status`` carries the message and the
            other three elements are empty strings.
    """
    if audio_file is None:
        return "Please upload an audio file first.", "", "", ""

    try:
        # Read audio file as bytes
        with open(audio_file, 'rb') as f:
            audio_bytes = f.read()

        # Call transcription API
        transcription_result = await transcribe(audio_bytes)

        # Extract information
        full_text = transcription_result.get('full_text', '')
        segments = transcription_result.get('segments', [])
        language = transcription_result.get('language_detected', 'Unknown')
        processing_time = transcription_result.get('processing_time_seconds', 0)

        # Format results
        status = f"✅ Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"

        # Create formatted outputs
        segments_formatted = format_transcription_segments(segments)

        # Format JSON for display
        json_formatted = format_json_for_display(transcription_result)

        return status, full_text, segments_formatted, json_formatted

    except Exception as e:
        # UI boundary: report the failure in the status field instead of raising.
        return f"❌ Error during transcription: {str(e)}", "", "", ""


def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    This function provides a synchronous wrapper around the async transcription process,
    converting audio files to text using advanced speech recognition. It handles the
    async/await complexity internally and returns detailed transcription results including
    the full text, timestamped segments, language detection, and processing statistics.

    Args:
        audio_file (str): Full URL to the input audio file to be transcribed
                         (supports MP3, WAV, M4A, FLAC, OGG, and other common audio formats)

    Returns:
        tuple: A tuple containing four string elements:
            - status (str): Status message indicating success with language and processing time,
                           or error information if transcription failed
            - full_text (str): Complete transcription as plain text, or empty string on error
            - segments_formatted (str): Formatted text showing timestamped segments with
                                       start/end times, or empty string on error
            - json_formatted (str): Pretty-formatted JSON string containing complete transcription
                                   data including word-level timestamps and metadata, or empty
                                   string on error. The JSON structure includes:
                                   * "filename": original audio filename
                                   * "language_detected": detected language code (e.g., "en", "es", "fr")
                                   * "full_text": complete transcription text
                                   * "segments": array of text segments with timing and word breakdowns
                                   * "processing_time_seconds": time taken for transcription
                                   Each segment contains: start/end times, text, and words array with
                                   individual word timestamps and confidence scores (0.0-1.0 range)

    Example:
        status, text, segments, json_data = transcribe_audio_sync("url/to/audio.mp3")
        if "✅" in status:
            print(f"Success: {status}")
            print(f"Transcription: {text}")
            print(f"Segments: {segments}")
        else:
            print(f"Error: {status}")

    Note:
        - Automatically detects language in the audio file
        - Provides word-level and segment-level timestamps for precise audio editing
        - Returns confidence scores for quality assessment
        - Handles various audio formats and sample rates automatically
        - Processing time depends on audio length and complexity
        - All timestamps are provided in seconds with decimal precision
        - Function blocks until transcription is complete (synchronous)
        - For async usage, use process_transcription() directly instead
    """
    try:
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(process_transcription(audio_file))
        finally:
            # Close the loop even when run_until_complete raises; otherwise
            # every failed call would leak an open event loop.
            loop.close()
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", ""


def create_audio_transcription_tab():
    """Create the audio transcription tab interface.

    Builds the upload widget, info labels, transcribe button, result views and
    usage guide, and wires the change/click event handlers.

    NOTE(review): components are created without opening a Blocks context
    here, so this is presumably called inside an enclosing ``gr.Blocks`` /
    tab context -- confirm against the caller.
    """
    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )

    with gr.Row():
        with gr.Column(scale=2):
            # File upload
            audio_input = gr.Audio(
                label="📤 Upload Audio File",
                type="filepath"
            )

            # Audio info
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")

            # Transcribe button
            transcribe_btn = gr.Button("🎤 Start Transcription", variant="primary", size="lg")

            # Status message
            status_msg = gr.Markdown("")

    # Results section
    with gr.Row():
        with gr.Column():
            # Full transcription
            full_text_output = gr.Textbox(
                label="📝 Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )

        with gr.Column():
            # Segmented transcription with timestamps
            segments_output = gr.Markdown(
                label="⏱️ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )

    # JSON Results section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 📄 JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )

    # Event handlers
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )

    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )

    # Usage tips
    with gr.Accordion("📋 Transcription Guide", open=False):
        gr.Markdown("""
        **🎤 Supported Features:**
        - **Multiple Languages**: Automatic language detection
        - **High Accuracy**: Professional-grade transcription
        - **Word Timestamps**: Precise timing for each word
        - **Confidence Scores**: Quality indicators for each word
        - **JSON Output**: Complete structured data

        **📁 File Requirements:**
        - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
        - **Duration**: Best results with files under 10 minutes
        - **Quality**: Clear audio produces better quality results

        **💡 Tips:**
        - Use high-quality audio for best results
        - Consider splitting long files into segments
        - Copy JSON data using the copy button for easy access
        - JSON contains all metadata including word-level timestamps

        **📊 JSON Structure:**
        - **full_text**: Complete transcription text
        - **segments**: Timestamped text segments
        - **language_detected**: Detected language code
        - **processing_time_seconds**: API processing duration
        """)