File size: 10,356 Bytes
0a0ea7b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 |
# tabs/audio_transcription_tab.py - Audio Transcription Tab Component
import asyncio
import json
import gradio as gr
from utils.audio_utils import load_audio_info, format_time
from utils.transcription_utils import transcribe
def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates"""
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    samples, rate, seconds = load_audio_info(audio_file)
    if samples is None:
        # Helper signals an unreadable file by returning None for the data.
        return "β Could not read audio file", "File error"

    return (
        f"π File duration: {format_time(seconds)} ({seconds:.1f} seconds)",
        f"π΅ Sample rate: {rate:,} Hz | Ready for transcription",
    )
def format_transcription_segments(segments):
    """Render segments as markdown: a bold [start - end] range, then the text."""
    if not segments:
        return "No segments found"

    pieces = []
    for seg in segments:
        body = seg.get('text', '').strip()
        if not body:
            # Skip segments that contain no usable text.
            continue
        begin = seg.get('start', 0)
        finish = seg.get('end', 0)
        pieces.append(f"**[{format_time(begin)} - {format_time(finish)}]**\n{body}\n\n")
    return "".join(pieces)
def format_word_level_transcription(segments):
    """Render each word as an HTML span colour-coded by its confidence score.

    Hover title carries the exact confidence and start time; segments are
    separated by a blank line.
    """
    if not segments:
        return "No word-level data available"

    parts = []
    for seg in segments:
        tokens = seg.get('words', [])
        if not tokens:
            # No word-level data for this segment: emit nothing for it.
            continue
        for tok in tokens:
            text = tok.get('word', '')
            score = tok.get('score', 0)
            begin = tok.get('start', 0)
            # Confidence buckets: >0.9 green, >0.7 orange, otherwise red.
            shade = "green" if score > 0.9 else ("orange" if score > 0.7 else "red")
            parts.append(
                f'<span style="color: {shade}; font-weight: bold;" '
                f'title="Confidence: {score:.2f}, Time: {begin:.1f}s">{text}</span> '
            )
        parts.append("\n\n")
    return "".join(parts)
def format_json_for_display(transcription_data):
    """Serialize the transcription payload as human-readable, non-escaped JSON."""
    pretty = json.dumps(transcription_data, ensure_ascii=False, indent=2)
    return pretty
async def process_transcription(audio_file):
    """Run the transcription pipeline for one uploaded file.

    Returns a 4-tuple of (status, full_text, segments_markdown, json_text);
    on failure the last three elements are empty strings.
    """
    if audio_file is None:
        return "Please upload an audio file first.", "", "", ""
    try:
        # Load the raw bytes and hand them to the transcription backend.
        with open(audio_file, 'rb') as handle:
            payload = handle.read()
        result = await transcribe(payload)

        # Pull the fields the UI cares about out of the response dict.
        full_text = result.get('full_text', '')
        segments = result.get('segments', [])
        language = result.get('language_detected', 'Unknown')
        elapsed = result.get('processing_time_seconds', 0)

        status = f"β Transcription completed! Language: {language} | Processing time: {elapsed:.1f}s"
        return (
            status,
            full_text,
            format_transcription_segments(segments),
            format_json_for_display(result),
        )
    except Exception as e:
        # Surface any failure to the UI rather than crashing the callback.
        return f"β Error during transcription: {str(e)}", "", "", ""
def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    Blocking wrapper around the async transcription pipeline; for async usage
    call process_transcription() directly instead.

    Args:
        audio_file (str): Path/URL to the input audio file to be transcribed
            (MP3, WAV, M4A, FLAC, OGG, and other common formats).

    Returns:
        tuple[str, str, str, str]:
            - status: success message with detected language and processing
              time, or error information if transcription failed
            - full_text: complete transcription as plain text ("" on error)
            - segments_formatted: markdown of timestamped segments ("" on error)
            - json_formatted: pretty-printed JSON with the complete result,
              including "filename", "language_detected", "full_text",
              "segments" (with word-level timestamps and 0.0-1.0 confidence
              scores) and "processing_time_seconds" ("" on error)

    Note:
        - Language is detected automatically; timestamps are in seconds.
        - Processing time depends on audio length and complexity.
    """
    try:
        # asyncio.run creates a fresh event loop, runs the coroutine, and
        # ALWAYS closes the loop — the previous new_event_loop()/close()
        # sequence leaked the loop (and left it installed as the thread's
        # current loop) whenever process_transcription raised.
        return asyncio.run(process_transcription(audio_file))
    except Exception as e:
        # Boundary handler: report the failure in the status slot.
        return f"β Error: {str(e)}", "", "", ""
def create_audio_transcription_tab():
    """Create the audio transcription tab interface.

    Builds the upload/info/transcribe controls, result panes, and wires the
    Gradio event handlers. Intended to be called inside a gr.Blocks/Tab context.
    """
    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    # Static branding image; all chrome (label, download, fullscreen) disabled.
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )
    with gr.Row():
        with gr.Column(scale=2):
            # File upload — type="filepath" hands handlers a path string,
            # which transcribe_audio_sync/open() expects.
            audio_input = gr.Audio(
                label="π€ Upload Audio File",
                type="filepath"
            )
            # Audio info labels refreshed on every upload change.
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")
            # Transcribe button
            transcribe_btn = gr.Button("π€ Start Transcription", variant="primary", size="lg")
    # Status message updated with the transcription outcome.
    # NOTE(review): source indentation was lost; placement outside the Row
    # above is assumed — confirm against the original layout.
    status_msg = gr.Markdown("")
    # Results section: plain text on the left, timestamped markdown on the right.
    with gr.Row():
        with gr.Column():
            # Full transcription
            full_text_output = gr.Textbox(
                label="π Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )
        with gr.Column():
            # Segmented transcription with timestamps
            segments_output = gr.Markdown(
                label="β±οΈ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )
    # JSON Results section — raw structured payload, copyable.
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )
    # Event handlers: upload change refreshes file info; button click runs
    # the blocking transcription wrapper and fills all four result widgets.
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )
    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )
    # Usage tips shown in a collapsed accordion.
    with gr.Accordion("π Transcription Guide", open=False):
        gr.Markdown("""
**π€ Supported Features:**
- **Multiple Languages**: Automatic language detection
- **High Accuracy**: Professional-grade transcription
- **Word Timestamps**: Precise timing for each word
- **Confidence Scores**: Quality indicators for each word
- **JSON Output**: Complete structured data
**π File Requirements:**
- **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
- **Duration**: Best results with files under 10 minutes
- **Quality**: Clear audio produces better quality results
**π‘ Tips:**
- Use high-quality audio for best results
- Consider splitting long files into segments
- Copy JSON data using the copy button for easy access
- JSON contains all metadata including word-level timestamps
**π JSON Structure:**
- **full_text**: Complete transcription text
- **segments**: Timestamped text segments
- **language_detected**: Detected language code
- **processing_time_seconds**: API processing duration
""")
|