|
|
|
import asyncio |
|
import json |
|
|
|
import gradio as gr |
|
|
|
from utils.audio_utils import load_audio_info, format_time |
|
from utils.transcription_utils import transcribe |
|
|
|
|
|
def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates.

    Builds the (duration_text, status_text) markdown pair shown next to the
    upload control whenever the selected audio file changes.
    """
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    audio_data, sample_rate, duration = load_audio_info(audio_file)
    if audio_data is None:
        # load_audio_info signals an unreadable file by returning None data.
        return "β Could not read audio file", "File error"

    return (
        f"π File duration: {format_time(duration)} ({duration:.1f} seconds)",
        f"π΅ Sample rate: {sample_rate:,} Hz | Ready for transcription",
    )
|
|
|
|
|
def format_transcription_segments(segments):
    """Format transcription segments as markdown with start/end timestamps.

    Args:
        segments: List of segment dicts with optional 'start', 'end' (seconds)
            and 'text' keys, as produced by the transcription backend.

    Returns:
        str: One markdown entry per non-empty segment — a bold
        "[start - end]" header line followed by the segment text and a
        blank line — or "No segments found" for an empty/None input.
    """
    if not segments:
        return "No segments found"

    # Collect per-segment chunks and join once at the end instead of
    # repeated `+=` concatenation (quadratic in the worst case). The
    # unused enumerate() index from the original loop is dropped.
    chunks = []
    for segment in segments:
        text = segment.get('text', '').strip()
        if not text:
            continue  # skip empty / whitespace-only segments
        start_time = segment.get('start', 0)
        end_time = segment.get('end', 0)
        chunks.append(
            f"**[{format_time(start_time)} - {format_time(end_time)}]**\n{text}\n\n"
        )
    return "".join(chunks)
|
|
|
|
|
def format_word_level_transcription(segments):
    """Format word-level transcription as HTML with confidence-colored words.

    Each word becomes a <span> whose color encodes the recognition
    confidence (green > 0.9, orange > 0.7, red otherwise) and whose tooltip
    carries the exact confidence and start time.

    Args:
        segments: List of segment dicts, each with an optional 'words' list
            of dicts carrying 'word', 'score' (0.0-1.0) and 'start' (seconds).

    Returns:
        str: HTML markup (segments separated by a blank line), or
        "No word-level data available" for an empty/None input.
    """
    if not segments:
        return "No word-level data available"

    # Accumulate fragments and join once — avoids quadratic `+=` growth
    # on long transcripts.
    parts = []
    for segment in segments:
        words = segment.get('words', [])
        if not words:
            # Preserve original behavior: segments without words contribute
            # nothing, not even the trailing separator.
            continue
        for word in words:
            word_text = word.get('word', '')
            confidence = word.get('score', 0)
            start_time = word.get('start', 0)

            # Traffic-light coloring by confidence level.
            if confidence > 0.9:
                color = "green"
            elif confidence > 0.7:
                color = "orange"
            else:
                color = "red"

            parts.append(
                f'<span style="color: {color}; font-weight: bold;" '
                f'title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">{word_text}</span> '
            )
        parts.append("\n\n")

    return "".join(parts)
|
|
|
|
|
def format_json_for_display(transcription_data):
    """Serialize transcription results to an indented, human-readable JSON string.

    ensure_ascii=False keeps non-ASCII transcription text (accents, CJK, ...)
    readable instead of escaping it to \\uXXXX sequences.
    """
    pretty = json.dumps(
        transcription_data,
        indent=2,
        ensure_ascii=False,
    )
    return pretty
|
|
|
|
|
async def process_transcription(audio_file):
    """Transcribe an uploaded audio file and format the results for the UI.

    Args:
        audio_file: Filesystem path to the uploaded audio file, or None when
            nothing has been uploaded yet.

    Returns:
        tuple[str, str, str, str]: (status message, full transcription text,
        markdown-formatted timestamped segments, pretty-printed JSON string).
        The last three elements are empty strings when no file was supplied
        or an error occurred.
    """
    if audio_file is None:
        return "Please upload an audio file first.", "", "", ""

    try:
        # The transcription backend takes raw bytes, not a file path.
        with open(audio_file, 'rb') as f:
            audio_bytes = f.read()

        transcription_result = await transcribe(audio_bytes)

        full_text = transcription_result.get('full_text', '')
        segments = transcription_result.get('segments', [])
        language = transcription_result.get('language_detected', 'Unknown')
        processing_time = transcription_result.get('processing_time_seconds', 0)

        # Fix: this f-string literal was previously broken across two
        # physical lines (invalid syntax); it is rejoined onto one line.
        status = f"β Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"

        segments_formatted = format_transcription_segments(segments)
        json_formatted = format_json_for_display(transcription_result)

        return status, full_text, segments_formatted, json_formatted

    except Exception as e:
        # Surface the failure as a status string instead of raising into Gradio.
        return f"β Error during transcription: {str(e)}", "", "", ""
|
|
|
|
|
def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    Thin blocking wrapper around the async ``process_transcription`` coroutine
    for callers (e.g. Gradio event handlers) that cannot ``await``. It handles
    the event-loop management internally and returns detailed transcription
    results: full text, timestamped segments, language detection, and
    processing statistics.

    Args:
        audio_file (str): Path/URL of the input audio file to transcribe
            (MP3, WAV, M4A, FLAC, OGG, and other common formats).

    Returns:
        tuple[str, str, str, str]:
            - status: success message with detected language and processing
              time, or error information if transcription failed.
            - full_text: complete transcription as plain text, or "" on error.
            - segments_formatted: formatted text showing timestamped segments
              with start/end times, or "" on error.
            - json_formatted: pretty-printed JSON string with the complete
              transcription data, or "" on error. The JSON includes
              "filename", "language_detected", "full_text", "segments"
              (each with start/end times, text, and a words array carrying
              per-word timestamps and 0.0-1.0 confidence scores), and
              "processing_time_seconds".

    Note:
        - Language is auto-detected; all timestamps are seconds with
          decimal precision.
        - Blocks until transcription completes; for async usage call
          ``process_transcription`` directly.
    """
    try:
        # asyncio.run() creates, runs, and *always* closes a fresh event loop.
        # The previous hand-rolled new_event_loop()/run_until_complete()/close()
        # sequence leaked the loop whenever the coroutine raised, because
        # close() was not inside a finally block.
        return asyncio.run(process_transcription(audio_file))
    except Exception as e:
        return f"β Error: {str(e)}", "", "", ""
|
|
|
def create_audio_transcription_tab():
    """Create the audio transcription tab interface.

    Lays out all Gradio components for the transcription tab and wires up
    their event handlers. Intended to be called inside an active
    gr.Blocks()/gr.Tab() context so the components attach to it.
    """
    # --- Header: intro text and branding image ---
    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )

    # --- Input area: upload control, live file info, and transcribe button ---
    with gr.Row():
        with gr.Column(scale=2):
            audio_input = gr.Audio(
                label="π€ Upload Audio File",
                type="filepath"  # handlers receive a filesystem path, not raw audio
            )

            # Refreshed by update_transcription_info whenever the file changes.
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")

            transcribe_btn = gr.Button("π€ Start Transcription", variant="primary", size="lg")

            # Filled with the success/error status after a transcription run.
            status_msg = gr.Markdown("")

    # --- Results: full text and timestamped segments side by side ---
    with gr.Row():
        with gr.Column():
            full_text_output = gr.Textbox(
                label="π Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )

        with gr.Column():
            segments_output = gr.Markdown(
                label="β±οΈ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )

    # --- Complete structured transcription data as copyable JSON ---
    with gr.Row():
        with gr.Column():
            gr.Markdown("### π JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )

    # Refresh the duration / sample-rate display when the uploaded file changes.
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )

    # Run the blocking transcription and fan results out to the four outputs.
    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )

    # --- Collapsible static usage guide (no event wiring) ---
    with gr.Accordion("π Transcription Guide", open=False):
        gr.Markdown("""
        **π€ Supported Features:**
        - **Multiple Languages**: Automatic language detection
        - **High Accuracy**: Professional-grade transcription
        - **Word Timestamps**: Precise timing for each word
        - **Confidence Scores**: Quality indicators for each word
        - **JSON Output**: Complete structured data

        **π File Requirements:**
        - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
        - **Duration**: Best results with files under 10 minutes
        - **Quality**: Clear audio produces better quality results

        **π‘ Tips:**
        - Use high-quality audio for best results
        - Consider splitting long files into segments
        - Copy JSON data using the copy button for easy access
        - JSON contains all metadata including word-level timestamps

        **π JSON Structure:**
        - **full_text**: Complete transcription text
        - **segments**: Timestamped text segments
        - **language_detected**: Detected language code
        - **processing_time_seconds**: API processing duration
        """)
|