# tabs/audio_transcription_tab.py - Audio Transcription Tab Component
import asyncio
import json

import gradio as gr

from utils.audio_utils import load_audio_info, format_time
from utils.transcription_utils import transcribe


def update_transcription_info(audio_file):
    """This should not be used by agents, only for UI updates"""
    if audio_file is None:
        return "No file uploaded", "Ready to transcribe"

    audio_data, sample_rate, duration = load_audio_info(audio_file)

    if audio_data is None:
        return "❌ Could not read audio file", "File error"

    duration_text = f"πŸ“ File duration: {format_time(duration)} ({duration:.1f} seconds)"
    status_text = f"🎡 Sample rate: {sample_rate:,} Hz | Ready for transcription"

    return duration_text, status_text


def format_transcription_segments(segments):
    """Format transcription segments with timestamps"""
    if not segments:
        return "No segments found"

    formatted_text = ""
    for i, segment in enumerate(segments):
        start_time = segment.get('start', 0)
        end_time = segment.get('end', 0)
        text = segment.get('text', '').strip()

        if text:
            formatted_text += f"**[{format_time(start_time)} - {format_time(end_time)}]**\n"
            formatted_text += f"{text}\n\n"

    return formatted_text


def format_word_level_transcription(segments):
    """Format word-level transcription with confidence scores"""
    if not segments:
        return "No word-level data available"

    formatted_text = ""
    for segment in segments:
        words = segment.get('words', [])
        if words:
            for word in words:
                word_text = word.get('word', '')
                confidence = word.get('score', 0)
                start_time = word.get('start', 0)

                # Color code based on confidence
                if confidence > 0.9:
                    color = "green"
                elif confidence > 0.7:
                    color = "orange"
                else:
                    color = "red"

                formatted_text += f'<span style="color: {color}; font-weight: bold;" title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">{word_text}</span> '
            formatted_text += "\n\n"

    return formatted_text


def format_json_for_display(transcription_data):
    """Format transcription data as pretty JSON string"""
    return json.dumps(transcription_data, indent=2, ensure_ascii=False)


async def process_transcription(audio_file):
    """Process audio transcription"""
    if audio_file is None:
        return "Please upload an audio file first.", "", "", ""

    try:
        # Read audio file as bytes
        with open(audio_file, 'rb') as f:
            audio_bytes = f.read()

        # Call transcription API
        transcription_result = await transcribe(audio_bytes)

        # Extract information
        full_text = transcription_result.get('full_text', '')
        segments = transcription_result.get('segments', [])
        language = transcription_result.get('language_detected', 'Unknown')
        processing_time = transcription_result.get('processing_time_seconds', 0)

        # Format results
        status = f"βœ… Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"

        # Create formatted outputs
        segments_formatted = format_transcription_segments(segments)

        # Format JSON for display
        json_formatted = format_json_for_display(transcription_result)

        return status, full_text, segments_formatted, json_formatted

    except Exception as e:
        return f"❌ Error during transcription: {str(e)}", "", "", ""


def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
    """Synchronously transcribe an audio file using AI-powered speech recognition.

    This function provides a synchronous wrapper around the async transcription process,
    converting audio files to text using advanced speech recognition. It handles the
    async/await complexity internally and returns detailed transcription results including
    the full text, timestamped segments, language detection, and processing statistics.

    Args:
        audio_file (str): Path to the input audio file to be transcribed
                         (supports MP3, WAV, M4A, FLAC, OGG, and other common audio formats)

    Returns:
        tuple: A tuple containing four string elements:
            - status (str): Status message indicating success with language and processing time,
              or error information if transcription failed
            - full_text (str): Complete transcription as plain text, or empty string on error
            - segments_formatted (str): Formatted text showing timestamped segments with
              start/end times, or empty string on error
            - json_formatted (str): Pretty-formatted JSON string containing complete transcription
              data including word-level timestamps and metadata, or empty string on error.
              The JSON structure includes:
              * "filename": original audio filename
              * "language_detected": detected language code (e.g., "en", "es", "fr")
              * "full_text": complete transcription text
              * "segments": array of text segments with timing and word breakdowns
              * "processing_time_seconds": time taken for transcription
              Each segment contains: start/end times, text, and words array with individual
              word timestamps and confidence scores (0.0-1.0 range)
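              Illustrative shape (field values are hypothetical, for reference only):
                  {"filename": "audio.mp3",
                   "language_detected": "en",
                   "full_text": "Hello world",
                   "segments": [{"start": 0.0, "end": 1.2, "text": "Hello world",
                                 "words": [{"word": "Hello", "start": 0.0, "score": 0.98}]}],
                   "processing_time_seconds": 3.4}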

    Example:
        status, text, segments, json_data = transcribe_audio_sync("path/to/audio.mp3")
        if "βœ…" in status:
            print(f"Success: {status}")
            print(f"Transcription: {text}")
            print(f"Segments: {segments}")
        else:
            print(f"Error: {status}")

    Note:
        - Automatically detects language in the audio file
        - Provides word-level and segment-level timestamps for precise audio editing
        - Returns confidence scores for quality assessment
        - Handles various audio formats and sample rates automatically
        - Processing time depends on audio length and complexity
        - All timestamps are provided in seconds with decimal precision
        - Function blocks until transcription is complete (synchronous)
        - For async usage, use process_transcription() directly instead
    """
    try:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            return loop.run_until_complete(process_transcription(audio_file))
        finally:
            # Ensure the loop is always closed and unset, even when
            # transcription raises, so repeated calls do not leak loops.
            asyncio.set_event_loop(None)
            loop.close()
    except Exception as e:
        return f"❌ Error: {str(e)}", "", "", ""


def create_audio_transcription_tab():
    """Create the audio transcription tab interface"""

    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
    gr.Markdown("**Powered by Modal Labs**")
    gr.Image(
        value="assets/modal-logo.png",
        show_label=False,
        container=False,
        show_fullscreen_button=False,
        show_download_button=False,
        width=200,
        height=200
    )

    with gr.Row():
        with gr.Column(scale=2):
            # File upload
            audio_input = gr.Audio(
                label="πŸ“€ Upload Audio File",
                type="filepath"
            )

            # Audio info
            duration_info = gr.Markdown("No file uploaded")
            status_info = gr.Markdown("Ready to transcribe")

            # Transcribe button
            transcribe_btn = gr.Button("🎀 Start Transcription", variant="primary", size="lg")

            # Status message
            status_msg = gr.Markdown("")

    # Results section
    with gr.Row():
        with gr.Column():
            # Full transcription
            full_text_output = gr.Textbox(
                label="πŸ“ Full Transcription",
                lines=10,
                max_lines=20,
                placeholder="Transcription will appear here..."
            )

        with gr.Column():
            # Segmented transcription with timestamps
            segments_output = gr.Markdown(
                label="⏱️ Timestamped Segments",
                value="Segments with timestamps will appear here..."
            )

    # JSON Results section
    with gr.Row():
        with gr.Column():
            gr.Markdown("### πŸ“„ JSON Results")
            json_output = gr.Textbox(
                label="Complete JSON Data",
                lines=15,
                max_lines=25,
                placeholder="JSON transcription data will appear here...",
                show_copy_button=True
            )

    # Event handlers
    audio_input.change(
        fn=update_transcription_info,
        inputs=[audio_input],
        outputs=[duration_info, status_info]
    )

    transcribe_btn.click(
        fn=transcribe_audio_sync,
        inputs=[audio_input],
        outputs=[status_msg, full_text_output, segments_output, json_output]
    )
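
    # Note: the synchronous wrapper is used for the click handler; Gradio also
    # accepts async callables, so process_transcription could be wired directly
    # if a non-blocking handler is preferred.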

    # Usage tips
    with gr.Accordion("πŸ“‹ Transcription Guide", open=False):
        gr.Markdown("""
        **🎀 Supported Features:**
        - **Multiple Languages**: Automatic language detection
        - **High Accuracy**: Professional-grade transcription
        - **Word Timestamps**: Precise timing for each word
        - **Confidence Scores**: Quality indicators for each word
        - **JSON Output**: Complete structured data

        **πŸ“ File Requirements:**
        - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
        - **Duration**: Best results with files under 10 minutes
        - **Quality**: Clear audio produces more accurate transcriptions

        **πŸ’‘ Tips:**
        - Use high-quality audio for best results
        - Consider splitting long files into segments
        - Copy JSON data using the copy button for easy access
        - JSON contains all metadata including word-level timestamps

        **πŸ“Š JSON Structure:**
        - **full_text**: Complete transcription text
        - **segments**: Timestamped text segments
        - **language_detected**: Detected language code
        - **processing_time_seconds**: API processing duration
        """)