# AudioEditor / tabs / audio_transcription_tab.py
# Ahmet Emre Şafak
# initial commit
# 0a0ea7b
# tabs/audio_transcription_tab.py - Audio Transcription Tab Component
import asyncio
import json
import gradio as gr
from utils.audio_utils import load_audio_info, format_time
from utils.transcription_utils import transcribe
def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates.

    Build the two Markdown strings displayed beneath the upload widget.

    Args:
        audio_file: Value of the audio input component, or ``None`` when
            nothing has been uploaded.

    Returns:
        A ``(duration_text, status_text)`` tuple of strings. Fixed
        placeholder messages are returned when there is no file or when
        ``load_audio_info`` reports no audio data.
    """
    # Guard: nothing uploaded yet.
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    samples, rate_hz, seconds = load_audio_info(audio_file)

    # Guard: the loader signals an unreadable file with a None first element.
    if samples is None:
        return "❌ Could not read audio file", "File error"

    return (
        f"πŸ“ File duration: {format_time(seconds)} ({seconds:.1f} seconds)",
        f"🎡 Sample rate: {rate_hz:,} Hz | Ready for transcription",
    )
def format_transcription_segments(segments):
    """Format transcription segments with timestamps.

    Every segment whose text is non-blank becomes a bold
    ``**[start - end]**`` header line, the stripped text, and a blank line.
    Segments with missing, null or whitespace-only text are skipped.

    Args:
        segments: Sequence of dicts read through the ``'start'``, ``'end'``
            and ``'text'`` keys. Missing times default to ``0``.

    Returns:
        The formatted Markdown string, ``"No segments found"`` when
        ``segments`` is empty or ``None``, or ``""`` when no segment has
        usable text.
    """
    if not segments:
        return "No segments found"

    pieces = []
    for segment in segments:
        # An explicit ``"text": None`` would otherwise crash on ``.strip()``;
        # treat it the same as a missing key.
        text = (segment.get('text') or '').strip()
        if not text:
            continue
        start_time = segment.get('start', 0)
        end_time = segment.get('end', 0)
        pieces.append(
            f"**[{format_time(start_time)} - {format_time(end_time)}]**\n"
            f"{text}\n\n"
        )
    # Single join instead of repeated string concatenation.
    return "".join(pieces)
def format_word_level_transcription(segments):
    """Format word-level transcription with confidence scores.

    Each word is rendered as an HTML ``<span>`` coloured by confidence
    (``> 0.9`` green, ``> 0.7`` orange, otherwise red) with a tooltip showing
    the confidence and start time. The words of one segment are followed by
    a blank line; segments without words contribute nothing.

    Args:
        segments: Sequence of dicts with an optional ``'words'`` list. Each
            word dict is read through the ``'word'``, ``'score'`` and
            ``'start'`` keys; missing or null numeric values count as ``0``.

    Returns:
        The HTML string, or ``"No word-level data available"`` when
        ``segments`` is empty or ``None``.
    """
    # Local import keeps this block self-contained.
    from html import escape

    if not segments:
        return "No word-level data available"

    pieces = []
    for segment in segments:
        words = segment.get('words', [])
        if not words:
            continue
        for word in words:
            word_text = word.get('word', '')
            # ``or 0`` also covers an explicit null, which would otherwise
            # break the comparisons and the float format specs below.
            confidence = word.get('score') or 0
            start_time = word.get('start') or 0

            # Color code based on confidence
            if confidence > 0.9:
                color = "green"
            elif confidence > 0.7:
                color = "orange"
            else:
                color = "red"

            # The word is placed in element content, so escape &, < and >
            # to stop transcribed text from being interpreted as markup.
            safe_text = escape(str(word_text), quote=False)
            pieces.append(
                f'<span style="color: {color}; font-weight: bold;" '
                f'title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">'
                f'{safe_text}</span> '
            )
        pieces.append("\n\n")
    return "".join(pieces)
def format_json_for_display(transcription_data):
    """Serialize transcription data to a human-readable JSON string.

    The output is indented by two spaces and keeps non-ASCII characters
    as-is instead of escaping them.

    Args:
        transcription_data: Any JSON-serializable object.

    Returns:
        The pretty-printed JSON text.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    pretty = json.dumps(transcription_data, **dump_options)
    return pretty
async def process_transcription(audio_file):
    """Transcribe an audio file and prepare the strings shown in the UI.

    The file is read as bytes, passed to ``transcribe``, and the resulting
    mapping is turned into a status line, the plain text, the formatted
    segments and a pretty-printed JSON dump.

    Args:
        audio_file: Path of the audio file to open, or ``None``.

    Returns:
        A ``(status, full_text, segments_formatted, json_formatted)`` tuple.
        When no file is given, or when any exception occurs, ``status``
        carries the message and the remaining three items are empty strings.
    """
    empty_outputs = ("", "", "")

    if audio_file is None:
        return ("Please upload an audio file first.",) + empty_outputs

    try:
        # Load the raw bytes that the transcription call expects.
        with open(audio_file, 'rb') as handle:
            payload = handle.read()

        result = await transcribe(payload)

        # Status line summarising language and timing.
        detected = result.get('language_detected', 'Unknown')
        elapsed = result.get('processing_time_seconds', 0)
        headline = f"βœ… Transcription completed! Language: {detected} | Processing time: {elapsed:.1f}s"

        return (
            headline,
            result.get('full_text', ''),
            format_transcription_segments(result.get('segments', [])),
            format_json_for_display(result),
        )
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return (f"❌ Error during transcription: {str(e)}",) + empty_outputs
def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    This function is a blocking wrapper around the async
    ``process_transcription`` coroutine. It runs the coroutine to completion
    on a fresh event loop and returns its result unchanged.

    Args:
        audio_file (str): Location of the input audio file to be transcribed
            (MP3, WAV, M4A, FLAC, OGG, and other common audio formats). The
            value is ultimately opened with ``open()``, so it must resolve to
            a readable local file by the time this function runs.

    Returns:
        tuple: A tuple containing four string elements:
            - status (str): Status message indicating success with language
              and processing time, or error information if transcription failed
            - full_text (str): Complete transcription as plain text, or empty
              string on error
            - segments_formatted (str): Formatted text showing timestamped
              segments with start/end times, or empty string on error
            - json_formatted (str): Pretty-formatted JSON string containing
              the complete transcription result, or empty string on error.
              Keys read by this module are:
                * "language_detected": detected language code
                * "full_text": complete transcription text
                * "segments": array of text segments with timing
                * "processing_time_seconds": time taken for transcription
              Any further keys returned by the transcription service (for
              example word-level data) are passed through as-is.

    Example:
        status, text, segments, json_data = transcribe_audio_sync("url/to/audio.mp3")
        if "βœ…" in status:
            print(f"Success: {status}")
            print(f"Transcription: {text}")
            print(f"Segments: {segments}")
        else:
            print(f"Error: {status}")

    Note:
        - Function blocks until transcription is complete (synchronous)
        - Must not be called from a thread that already has a running event
          loop; in that case an error tuple is returned
        - Never raises: every failure is reported through the status string
        - For async usage, use process_transcription() directly instead
    """
    try:
        # asyncio.run creates the loop, runs the coroutine and always closes
        # the loop afterwards, even when the coroutine raises. The previous
        # manual new_event_loop()/close() sequence leaked the loop on error
        # and left a closed loop installed as the thread's current loop.
        return asyncio.run(process_transcription(audio_file))
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", ""
def create_audio_transcription_tab() -> None:
    """Create the audio transcription tab interface.

    Instantiates the Gradio components of the tab in display order: intro
    text and logo, the upload column (audio input, file info, transcribe
    button, status message), the result row (full text and timestamped
    segments), the JSON result box, and a collapsed usage guide.

    Two events are wired:
        - ``audio_input.change`` calls ``update_transcription_info`` and
          fills the two info Markdown components.
        - ``transcribe_btn.click`` calls ``transcribe_audio_sync`` and fills
          the status message and the three result components.

    The components are not wrapped in a container of their own, so this is
    presumably meant to be called inside an active ``gr.Blocks``/tab
    context - confirm with the caller.

    Returns:
        None. The function is used for its side effect of creating components.
    """
    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    # Static logo; label, container and toolbar buttons are all turned off.
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )
    with gr.Row():
        with gr.Column(scale=2):
            # File upload
            # type="filepath" hands the handlers a path string, which
            # process_transcription later opens with open().
            audio_input = gr.Audio(
                label="πŸ“€ Upload Audio File",
                type="filepath"
            )
            # Audio info
            # Initial texts match what update_transcription_info returns for None.
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")
            # Transcribe button
            transcribe_btn = gr.Button("🎀 Start Transcription", variant="primary", size="lg")
            # Status message
            status_msg = gr.Markdown("")
    # Results section
    with gr.Row():
        with gr.Column():
            # Full transcription
            full_text_output = gr.Textbox(
                label="πŸ“ Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )
        with gr.Column():
            # Segmented transcription with timestamps
            segments_output = gr.Markdown(
                label="⏱️ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )
    # JSON Results section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### πŸ“„ JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )
    # Event handlers
    # Refresh the duration/status lines whenever the uploaded file changes.
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )
    # The output order must match the 4-tuple returned by transcribe_audio_sync:
    # (status, full_text, segments_formatted, json_formatted).
    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )
    # Usage tips
    with gr.Accordion("πŸ“‹ Transcription Guide", open=False):
        gr.Markdown("""
**🎀 Supported Features:**
- **Multiple Languages**: Automatic language detection
- **High Accuracy**: Professional-grade transcription
- **Word Timestamps**: Precise timing for each word
- **Confidence Scores**: Quality indicators for each word
- **JSON Output**: Complete structured data
**πŸ“ File Requirements:**
- **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
- **Duration**: Best results with files under 10 minutes
- **Quality**: Clear audio produces better quality results
**πŸ’‘ Tips:**
- Use high-quality audio for best results
- Consider splitting long files into segments
- Copy JSON data using the copy button for easy access
- JSON contains all metadata including word-level timestamps
**πŸ“Š JSON Structure:**
- **full_text**: Complete transcription text
- **segments**: Timestamped text segments
- **language_detected**: Detected language code
- **processing_time_seconds**: API processing duration
""")