|
|
|
import asyncio |
|
import json |
|
|
|
import gradio as gr |
|
|
|
from utils.audio_utils import load_audio_info, format_time |
|
from utils.transcription_utils import transcribe |
|
|
|
|
|
def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates.

    Builds the (duration_text, status_text) markdown pair shown next to the
    upload control whenever the selected audio file changes.
    """
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    audio_data, sample_rate, duration = load_audio_info(audio_file)
    if audio_data is None:
        # load_audio_info signals an unreadable file by returning None data.
        return "β Could not read audio file", "File error"

    return (
        f"π File duration: {format_time(duration)} ({duration:.1f} seconds)",
        f"π΅ Sample rate: {sample_rate:,} Hz | Ready for transcription",
    )
|
|
|
|
|
def format_transcription_segments(segments):
    """Format transcription segments as markdown with start/end timestamps.

    Args:
        segments: List of segment dicts with optional 'start', 'end' (seconds)
            and 'text' keys, as produced by the transcription backend.

    Returns:
        str: One markdown entry per non-empty segment — a bold
        "[start - end]" header line followed by the segment text and a
        blank line — or "No segments found" for an empty/None input.
    """
    if not segments:
        return "No segments found"

    # Collect per-segment chunks and join once at the end instead of
    # repeated `+=` concatenation (quadratic in the worst case). The
    # unused enumerate() index from the original loop is dropped.
    chunks = []
    for segment in segments:
        text = segment.get('text', '').strip()
        if not text:
            continue  # skip empty / whitespace-only segments
        start_time = segment.get('start', 0)
        end_time = segment.get('end', 0)
        chunks.append(
            f"**[{format_time(start_time)} - {format_time(end_time)}]**\n{text}\n\n"
        )
    return "".join(chunks)
|
|
|
|
|
def format_word_level_transcription(segments):
    """Format word-level transcription as HTML with confidence-colored words.

    Each word becomes a <span> whose color encodes the recognition
    confidence (green > 0.9, orange > 0.7, red otherwise) and whose tooltip
    carries the exact confidence and start time.

    Args:
        segments: List of segment dicts, each with an optional 'words' list
            of dicts carrying 'word', 'score' (0.0-1.0) and 'start' (seconds).

    Returns:
        str: HTML markup (segments separated by a blank line), or
        "No word-level data available" for an empty/None input.
    """
    if not segments:
        return "No word-level data available"

    # Accumulate fragments and join once — avoids quadratic `+=` growth
    # on long transcripts.
    parts = []
    for segment in segments:
        words = segment.get('words', [])
        if not words:
            # Preserve original behavior: segments without words contribute
            # nothing, not even the trailing separator.
            continue
        for word in words:
            word_text = word.get('word', '')
            confidence = word.get('score', 0)
            start_time = word.get('start', 0)

            # Traffic-light coloring by confidence level.
            if confidence > 0.9:
                color = "green"
            elif confidence > 0.7:
                color = "orange"
            else:
                color = "red"

            parts.append(
                f'<span style="color: {color}; font-weight: bold;" '
                f'title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">{word_text}</span> '
            )
        parts.append("\n\n")

    return "".join(parts)
|
|
|
|
|
def format_json_for_display(transcription_data):
    """Serialize transcription results to an indented, human-readable JSON string.

    ensure_ascii=False keeps non-ASCII transcription text (accents, CJK, ...)
    readable instead of escaping it to \\uXXXX sequences.
    """
    pretty = json.dumps(
        transcription_data,
        indent=2,
        ensure_ascii=False,
    )
    return pretty
|
|
|
|
|
async def process_transcription(audio_file):
    """Transcribe an uploaded audio file and format the results for the UI.

    Args:
        audio_file: Filesystem path to the uploaded audio file, or None when
            nothing has been uploaded yet.

    Returns:
        tuple[str, str, str, str]: (status message, full transcription text,
        markdown-formatted timestamped segments, pretty-printed JSON string).
        The last three elements are empty strings when no file was supplied
        or an error occurred.
    """
    if audio_file is None:
        return "Please upload an audio file first.", "", "", ""

    try:
        # The transcription backend takes raw bytes, not a file path.
        with open(audio_file, 'rb') as f:
            audio_bytes = f.read()

        transcription_result = await transcribe(audio_bytes)

        full_text = transcription_result.get('full_text', '')
        segments = transcription_result.get('segments', [])
        language = transcription_result.get('language_detected', 'Unknown')
        processing_time = transcription_result.get('processing_time_seconds', 0)

        # Fix: this f-string literal was previously broken across two
        # physical lines (invalid syntax); it is rejoined onto one line.
        status = f"β Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"

        segments_formatted = format_transcription_segments(segments)
        json_formatted = format_json_for_display(transcription_result)

        return status, full_text, segments_formatted, json_formatted

    except Exception as e:
        # Surface the failure as a status string instead of raising into Gradio.
        return f"β Error during transcription: {str(e)}", "", "", ""
|
|
|
|
|
def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    Thin blocking wrapper around the async ``process_transcription`` coroutine
    for callers (e.g. Gradio event handlers) that cannot ``await``. It handles
    the event-loop management internally and returns detailed transcription
    results: full text, timestamped segments, language detection, and
    processing statistics.

    Args:
        audio_file (str): Path/URL of the input audio file to transcribe
            (MP3, WAV, M4A, FLAC, OGG, and other common formats).

    Returns:
        tuple[str, str, str, str]:
            - status: success message with detected language and processing
              time, or error information if transcription failed.
            - full_text: complete transcription as plain text, or "" on error.
            - segments_formatted: formatted text showing timestamped segments
              with start/end times, or "" on error.
            - json_formatted: pretty-printed JSON string with the complete
              transcription data, or "" on error. The JSON includes
              "filename", "language_detected", "full_text", "segments"
              (each with start/end times, text, and a words array carrying
              per-word timestamps and 0.0-1.0 confidence scores), and
              "processing_time_seconds".

    Note:
        - Language is auto-detected; all timestamps are seconds with
          decimal precision.
        - Blocks until transcription completes; for async usage call
          ``process_transcription`` directly.
    """
    try:
        # asyncio.run() creates, runs, and *always* closes a fresh event loop.
        # The previous hand-rolled new_event_loop()/run_until_complete()/close()
        # sequence leaked the loop whenever the coroutine raised, because
        # close() was not inside a finally block.
        return asyncio.run(process_transcription(audio_file))
    except Exception as e:
        return f"β Error: {str(e)}", "", "", ""
|
|
|
def create_audio_transcription_tab():
    """Create the audio transcription tab interface.

    Lays out all Gradio components for the transcription tab and wires up
    their event handlers. Intended to be called inside an active
    gr.Blocks()/gr.Tab() context so the components attach to it.
    """
    # --- Header: intro text and branding image ---
    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )

    # --- Input area: upload control, live file info, and transcribe button ---
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                label="π€ Upload Audio File",
                type="filepath"  # handlers receive a filesystem path, not raw audio
            )

            # Refreshed by update_transcription_info whenever the file changes.
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")

            transcribe_btn = gr.Button("π€ Start Transcription", variant="primary", size="lg")

            # Filled with the success/error status after a transcription run.
            status_msg = gr.Markdown("")

    # --- Results: full text and timestamped segments side by side ---
    with gr.Row():
        with gr.Column():
            full_text_output = gr.Textbox(
                label="π Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )

        with gr.Column():
            segments_output = gr.Markdown(
                label="β±οΈ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )

    # --- Complete structured transcription data as copyable JSON ---
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )

    # Refresh the duration / sample-rate display when the uploaded file changes.
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )

    # Run the blocking transcription and fan results out to the four outputs.
    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )

    # --- Collapsible static usage guide (no event wiring) ---
    with gr.Accordion("π Transcription Guide", open=False):
        gr.Markdown("""
        **π€ Supported Features:**
        - **Multiple Languages**: Automatic language detection
        - **High Accuracy**: Professional-grade transcription
        - **Word Timestamps**: Precise timing for each word
        - **Confidence Scores**: Quality indicators for each word
        - **JSON Output**: Complete structured data

        **π File Requirements:**
        - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
        - **Duration**: Best results with files under 10 minutes
        - **Quality**: Clear audio produces better quality results

        **π‘ Tips:**
        - Use high-quality audio for best results
        - Consider splitting long files into segments
        - Copy JSON data using the copy button for easy access
        - JSON contains all metadata including word-level timestamps

        **π JSON Structure:**
        - **full_text**: Complete transcription text
        - **segments**: Timestamped text segments
        - **language_detected**: Detected language code
        - **processing_time_seconds**: API processing duration
        """)
|