Ahmet Emre Şafak commited on
Commit
0a0ea7b
·
0 Parent(s):

initial commit

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ .env
2
+ .venv/
3
+ /.idea/
4
+ .gradio/
5
+ **/__pycache__/
6
+ .DS_STORE
7
+ **/.DS_STORE
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.11
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Foo
3
+ emoji: 📚
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 5.33.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ short_description: I'm trying to learn Gradio
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from dotenv import load_dotenv
3
+
4
+ from tabs.audio_cutter_tab import create_audio_cutter_tab
5
+ from tabs.audio_effects_tab import create_audio_effects_tab
6
+ from tabs.audio_merger_tab import create_audio_merger_tab
7
+ from tabs.audio_transcription_tab import create_audio_transcription_tab
8
+
9
+
10
def create_app():
    """Build the top-level Gradio Blocks application.

    Assembles one tab per audio tool and returns the Blocks instance
    (the caller is responsible for launching it).
    """
    # (label, builder) pairs, rendered in order as tabs.
    tab_builders = [
        ("✂️ Audio Cutter", create_audio_cutter_tab),
        ("🔗 Audio Merger", create_audio_merger_tab),
        ("🎛️ Audio Effects", create_audio_effects_tab),
        ("📊 Audio Transcription", create_audio_transcription_tab),
    ]

    with gr.Blocks(title="Audio Toolkit", theme=gr.themes.Soft()) as app:
        gr.Markdown("# 🎵 Audio Toolkit")
        gr.Markdown("A comprehensive audio processing toolkit with multiple tools.")

        with gr.Tabs():
            for tab_label, build_tab in tab_builders:
                with gr.TabItem(tab_label):
                    build_tab()
    return app
32
+
33
+
34
if __name__ == "__main__":
    # Load environment variables (e.g. API keys) before building the UI.
    load_dotenv()
    # mcp_server=True also exposes the app's functions as MCP tools.
    create_app().launch(mcp_server=True)
assets/modal-logo.png ADDED
pyproject.toml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "AudioEditor"
3
+ version = "0.1.0"
4
+ description = "Edit your audio files with ease using this Gradio component."
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ dependencies = [
8
+ "aiohttp>=3.12.11",
9
+ "dotenv>=0.9.9",
10
+ "gradio>=5.33.0",
11
+ "gradio-audiogrid>=0.0.2",
12
+ "librosa",
13
+ "mcp",
14
+ "numpy>=2.2.6",
15
+ "soundfile>=0.13.1",
16
+ ]
requirements.txt ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.12.11
4
+ aiosignal==1.3.2
5
+ annotated-types==0.7.0
6
+ anyio==4.9.0
7
+ attrs==25.3.0
8
+ audioread==3.0.1
9
+ backports-tarfile==1.2.0
10
+ certifi==2025.4.26
11
+ cffi==1.17.1
12
+ charset-normalizer==3.4.2
13
+ click==8.2.1
14
+ decorator==5.2.1
15
+ docutils==0.21.2
16
+ dotenv==0.9.9
17
+ fastapi==0.115.12
18
+ ffmpy==0.6.0
19
+ filelock==3.18.0
20
+ frozenlist==1.6.2
21
+ fsspec==2025.5.1
22
+ gradio==5.33.0
23
+ gradio-audiogrid==0.0.2
24
+ gradio-client==1.10.2
25
+ groovy==0.1.2
26
+ h11==0.16.0
27
+ hf-xet==1.1.3
28
+ httpcore==1.0.9
29
+ httpx==0.28.1
30
+ httpx-sse==0.4.0
31
+ huggingface-hub==0.32.4
32
+ id==1.5.0
33
+ idna==3.10
34
+ importlib-metadata==8.7.0
35
+ jaraco-classes==3.4.0
36
+ jaraco-context==6.0.1
37
+ jaraco-functools==4.1.0
38
+ jinja2==3.1.6
39
+ joblib==1.5.1
40
+ keyring==25.6.0
41
+ lazy-loader==0.4
42
+ librosa==0.11.0
43
+ llvmlite==0.44.0
44
+ markdown-it-py==3.0.0
45
+ markupsafe==3.0.2
46
+ mcp==1.9.3
47
+ mdurl==0.1.2
48
+ more-itertools==10.7.0
49
+ msgpack==1.1.0
50
+ multidict==6.4.4
51
+ nh3==0.2.21
52
+ numba==0.61.2
53
+ numpy==2.2.6
54
+ orjson==3.10.18
55
+ packaging==25.0
56
+ pandas==2.3.0
57
+ pillow==11.2.1
58
+ platformdirs==4.3.8
59
+ pooch==1.8.2
60
+ propcache==0.3.1
61
+ pycparser==2.22
62
+ pydantic==2.11.5
63
+ pydantic-core==2.33.2
64
+ pydantic-settings==2.9.1
65
+ pydub==0.25.1
66
+ pygments==2.19.1
67
+ python-dateutil==2.9.0.post0
68
+ python-dotenv==1.1.0
69
+ python-multipart==0.0.20
70
+ pytz==2025.2
71
+ pyyaml==6.0.2
72
+ readme-renderer==44.0
73
+ requests==2.32.3
74
+ requests-toolbelt==1.0.0
75
+ rfc3986==2.0.0
76
+ rich==14.0.0
77
+ ruff==0.11.13
78
+ safehttpx==0.1.6
79
+ scikit-learn==1.7.0
80
+ scipy==1.15.3
81
+ semantic-version==2.10.0
82
+ shellingham==1.5.4
83
+ six==1.17.0
84
+ sniffio==1.3.1
85
+ soundfile==0.13.1
86
+ soxr==0.5.0.post1
87
+ sse-starlette==2.3.6
88
+ starlette==0.46.2
89
+ threadpoolctl==3.6.0
90
+ tomlkit==0.13.3
91
+ tqdm==4.67.1
92
+ twine==6.1.0
93
+ typer==0.16.0
94
+ typing-extensions==4.14.0
95
+ typing-inspection==0.4.1
96
+ tzdata==2025.2
97
+ urllib3==2.4.0
98
+ uvicorn==0.34.3
99
+ websockets==15.0.1
100
+ yarl==1.20.0
101
+ zipp==3.22.0
tabs/__init__.py ADDED
File without changes
tabs/audio_cutter_tab.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tabs/audio_cutter_tab.py - Audio Cutter Tab Component
2
+ import gradio as gr
3
+ from numpy import ndarray
4
+
5
+ from utils.audio_utils import cut_audio, format_time, load_audio_info
6
+
7
+
8
def update_duration_info(audio_file):
    """Refresh duration/sample-rate labels and slider ranges after an upload.

    UI-only helper: it returns Gradio ``update`` objects for the start/end
    sliders, so it should NOT be used by agents or automated systems —
    they should call the underlying audio utility functions directly.

    Args:
        audio_file: Filepath of the uploaded audio, or None when the
            component is cleared.

    Returns:
        tuple: (duration markdown, sample-rate markdown, start-slider update,
        end-slider update).
    """
    # NOTE: removed leftover debug print statements — they spammed stdout
    # and leaked uploaded file paths into the server log.
    if audio_file is None:
        return "No file uploaded", "Sample rate: N/A", gr.update(maximum=100), gr.update(maximum=100)

    # Load audio info
    audio_data, sample_rate, duration = load_audio_info(audio_file)

    if duration is None:
        return "❌ Could not read audio file", "Sample rate: N/A", gr.update(maximum=100), gr.update(maximum=100)

    duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
    sample_rate_text = f"🎵 Sample rate: {sample_rate:,} Hz"

    # Re-range both sliders to the file's duration; the end slider defaults
    # to the first 30 seconds (or the whole file if it is shorter).
    return (
        duration_text,
        sample_rate_text,
        gr.update(maximum=duration, value=0),
        gr.update(maximum=duration, value=min(30, duration)),
    )
33
+
34
+
35
def process_cut_audio(audio_file: str, _start_time: float, _end_time: float) -> tuple[tuple[int | float, ndarray] | None, str]:
    """Extract the segment between two timestamps from an audio file.

    Loads the file, clamps the requested window to the file's bounds, and
    cuts out the segment between the start and end times. Works with MP3,
    WAV, M4A, FLAC, OGG, and other common formats.

    Args:
        audio_file (str): Path to the input audio file to be cut.
        _start_time (float): Segment start in seconds (clamped to 0 if
            negative).
        _end_time (float): Segment end in seconds (clamped to the file
            duration if it exceeds it).

    Returns:
        tuple: ``((sample_rate, audio_data), status)`` on success, or
        ``(None, error_message)`` on failure. The status string is a
        user-friendly message suitable for UI display.

    Note:
        - After clamping, the start time must be strictly less than the end
          time or an error message is returned.
        - The cut segment keeps the original sample rate.
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, duration = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Clamp the requested window to the file's bounds.
        _start_time = max(_start_time, 0)
        _end_time = min(_end_time, duration)
        if _start_time >= _end_time:
            return None, f"Start time ({_start_time:.1f}s) must be less than end time ({_end_time:.1f}s)"

        # cut_audio works in milliseconds.
        start_millis = int(_start_time * 1000)
        end_millis = int(_end_time * 1000)
        segment = cut_audio(audio_data, sample_rate, start_millis, end_millis)

        cut_duration = (end_millis - start_millis) / 1000.0
        status = f"✅ Audio cut successfully! Duration: {format_time(cut_duration)} (from {format_time(_start_time)} to {format_time(_end_time)})"

        return (sample_rate, segment), status

    except Exception as e:
        return None, f"❌ Error cutting audio: {str(e)}"
104
+
105
+
106
def create_audio_cutter_tab():
    """Create the audio cutter tab interface.

    Builds the upload control, duration/sample-rate labels, time-range
    sliders, and output player, and wires them to
    ``update_duration_info`` and ``process_cut_audio``.
    """
    gr.Markdown("Upload an audio file and specify the start and end times to cut a segment.")

    with gr.Row():
        with gr.Column(scale=2):
            # File upload
            audio_input = gr.Audio(
                label="📤 Upload Audio File",
                type="filepath"
            )

            # Audio info
            duration_info = gr.Markdown("No file uploaded")
            sample_rate_info = gr.Markdown("Sample rate: N/A")

            # Time controls
            with gr.Row():
                start_time = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=0,
                    step=0.1,
                    label="⏱️ Start Time (seconds)",
                    info="When to start cutting"
                )
                end_time = gr.Slider(
                    minimum=0,
                    maximum=100,
                    value=30,
                    step=0.1,
                    label="⏱️ End Time (seconds)",
                    info="When to stop cutting"
                )

            # Cut button
            cut_btn = gr.Button("✂️ Cut Audio", variant="primary", size="lg")

            # Status message
            status_msg = gr.Markdown("")

        with gr.Column(scale=1):
            # Output audio. process_cut_audio returns a (sample_rate, ndarray)
            # tuple, so declare type="numpy" (consistent with the effects tab)
            # instead of type="filepath".
            audio_output = gr.Audio(
                label="🎧 Cut Audio Result",
                type="numpy"
            )

            # Download info
            gr.Markdown(
                "💾 **Download:** Right-click the audio player above and select 'Save audio as...' to download the cut audio file.")

    # Event handlers
    audio_input.change(
        fn=update_duration_info,
        inputs=[audio_input],
        outputs=[duration_info, sample_rate_info, start_time, end_time]
    )

    cut_btn.click(
        fn=process_cut_audio,
        inputs=[audio_input, start_time, end_time],
        outputs=[audio_output, status_msg]
    )

    # Usage tips
    with gr.Accordion("📋 Usage Tips", open=False):
        gr.Markdown("""
        **Supported formats:** MP3, WAV, M4A, FLAC, OGG, and more

        **How to use:**
        1. Upload your audio file
        2. Check the duration and sample rate information
        3. Use the sliders to set start and end times
        4. Click "Cut Audio" to process
        5. Play the result and download if satisfied

        **Tips:**
        - The sliders will automatically adjust to your file's duration
        - Sample rate is preserved in the output file
        - You can fine-tune times using the slider or type exact values
        - Output format is WAV for best compatibility
        """)
tabs/audio_effects_tab.py ADDED
@@ -0,0 +1,549 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tabs/audio_effects_tab.py - Audio Effects Tab Component
2
+ import gradio as gr
3
+ import numpy as np
4
+
5
+ from utils.audio_utils import (
6
+ load_audio_info, format_time, normalize_audio, adjust_volume,
7
+ apply_fade_in, apply_fade_out, reverse_audio, apply_speed_change,
8
+ trim_silence, get_audio_stats
9
+ )
10
+
11
+
12
def update_audio_info(audio_file):
    """Refresh the duration and statistics labels after an upload.

    UI-only callback; this component should not be used by agents or
    automated systems.
    """
    if audio_file is None:
        return "No file uploaded", "Audio stats: N/A"

    audio_data, sample_rate, duration = load_audio_info(audio_file)
    if audio_data is None:
        return "❌ Could not read audio file", "Audio stats: N/A"

    # Summarize peak / RMS levels alongside the sample rate.
    stats = get_audio_stats(audio_data, sample_rate)
    duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
    stats_text = (
        f"🎵 Sample rate: {sample_rate:,} Hz | "
        f"Peak: {stats['peak_level_db']:.1f} dB | "
        f"RMS: {stats['rms_level_db']:.1f} dB"
    )
    return duration_text, stats_text
29
+
30
+
31
def apply_normalization(audio_file: str, target_level: float) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Normalize an audio file's peak level to ``target_level`` dB.

    Args:
        audio_file (str): Path to the input audio file (MP3, WAV, M4A, FLAC,
            OGG, and other common formats).
        target_level (float): Target peak level in decibels; common values
            are -12 dB (quiet), -6 dB (moderate), -3 dB (loud).

    Returns:
        tuple: ``((sample_rate, normalized_audio), status)`` on success, or
        ``(None, error_message)`` on failure. The status message reports the
        peak level before and after normalization.

    Note:
        The original sample rate is preserved; only the gain changes.
        (Fixed: the return annotation previously used the builtin ``any``
        function as a type; it is now ``np.ndarray``.)
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, _ = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Apply normalization
        normalized_audio = normalize_audio(audio_data, target_level)

        # Report before/after peaks so the user can see the effect.
        original_stats = get_audio_stats(audio_data, sample_rate)
        new_stats = get_audio_stats(normalized_audio, sample_rate)

        status = f"✅ Normalization applied! Peak: {original_stats['peak_level_db']:.1f}dB → {new_stats['peak_level_db']:.1f}dB"

        return (sample_rate, normalized_audio), status

    except Exception as e:
        return None, f"❌ Error applying normalization: {str(e)}"
89
+
90
+
91
def apply_volume_adjustment(audio_file: str, gain_db: float) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Apply a volume gain (in dB) to an audio file.

    Args:
        audio_file (str): Path to the input audio file (MP3, WAV, M4A, FLAC,
            OGG, and other common formats).
        gain_db (float): Gain in decibels; positive values amplify, negative
            values attenuate (+6 dB roughly doubles volume, -6 dB halves it).
            Values above +6 dB may cause clipping.

    Returns:
        tuple: ``((sample_rate, adjusted_audio), status)`` on success, or
        ``(None, error_message)`` on failure. The status warns when the
        adjusted audio clips.

    Note:
        (Fixed: the return annotation previously used the builtin ``any``
        function as a type; it is now ``np.ndarray``.)
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, _ = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Apply volume adjustment
        adjusted_audio = adjust_volume(audio_data, gain_db)

        # Samples outside [-1.0, 1.0] will distort when rendered to a file.
        if np.max(np.abs(adjusted_audio)) > 1.0:
            status = f"⚠️ Volume adjusted by {gain_db:+.1f}dB (WARNING: Clipping detected!)"
        else:
            status = f"✅ Volume adjusted by {gain_db:+.1f}dB"

        return (sample_rate, adjusted_audio), status

    except Exception as e:
        return None, f"❌ Error adjusting volume: {str(e)}"
148
+
149
+
150
def apply_fades(audio_file: str, fade_in_ms: int, fade_out_ms: int) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Apply fade-in and/or fade-out effects to an audio file.

    Args:
        audio_file (str): Path to the input audio file (MP3, WAV, M4A, FLAC,
            OGG, and other common formats).
        fade_in_ms (int): Fade-in duration in milliseconds (0 = no fade-in;
            typical values 100-3000 ms).
        fade_out_ms (int): Fade-out duration in milliseconds (0 = no
            fade-out; typical values 100-3000 ms).

    Returns:
        tuple: ``((sample_rate, faded_audio), status)`` on success, or
        ``(None, error_message)`` on failure.

    Note:
        The original sample rate is preserved. (Fixed: the return annotation
        previously used the builtin ``any`` function as a type; it is now
        ``np.ndarray``.)
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, _ = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Copy so the fades never mutate the loaded buffer in place.
        processed_audio = audio_data.copy()

        # Apply fade in
        if fade_in_ms > 0:
            processed_audio = apply_fade_in(processed_audio, sample_rate, fade_in_ms)

        # Apply fade out
        if fade_out_ms > 0:
            processed_audio = apply_fade_out(processed_audio, sample_rate, fade_out_ms)

        status = f"✅ Fades applied! Fade in: {fade_in_ms}ms, Fade out: {fade_out_ms}ms"

        return (sample_rate, processed_audio), status

    except Exception as e:
        return None, f"❌ Error applying fades: {str(e)}"
213
+
214
+
215
def apply_reverse(audio_file: str) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Reverse an audio file so it plays backwards.

    Args:
        audio_file (str): Path to the input audio file (MP3, WAV, M4A, FLAC,
            OGG, and other common formats).

    Returns:
        tuple: ``((sample_rate, reversed_audio), status)`` on success, or
        ``(None, error_message)`` on failure.

    Note:
        Duration and sample rate are unchanged; only playback direction is
        reversed. (Fixed: the return annotation previously used the builtin
        ``any`` function as a type; it is now ``np.ndarray``.)
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, _ = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Reverse audio
        reversed_audio = reverse_audio(audio_data)

        status = "✅ Audio reversed successfully!"

        return (sample_rate, reversed_audio), status

    except Exception as e:
        return None, f"❌ Error reversing audio: {str(e)}"
266
+
267
+
268
def apply_speed_adjustment(audio_file: str, speed_factor: float) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Change the playback speed (and pitch) of an audio file.

    Args:
        audio_file (str): Path to the input audio file (MP3, WAV, M4A, FLAC,
            OGG, and other common formats).
        speed_factor (float): Speed multiplier; 1.0 = normal, 2.0 = double
            speed/half duration, 0.5 = half speed/double duration. Typical
            range 0.25-4.0; extreme values degrade quality.

    Returns:
        tuple: ``((sample_rate, speed_adjusted_audio), status)`` on success,
        or ``(None, error_message)`` on failure. The status reports the old
        and new durations.

    Note:
        Speed changes affect pitch as well (chipmunk/slow-motion effect);
        use time-stretching for pitch-preserving speed changes. (Fixed: the
        return annotation previously used the builtin ``any`` function as a
        type; it is now ``np.ndarray``.)
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, duration = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Apply speed change
        speed_adjusted_audio = apply_speed_change(audio_data, speed_factor)

        # Recompute duration from the resampled length at the original rate.
        new_duration = len(speed_adjusted_audio) / sample_rate
        status = f"✅ Speed adjusted by {speed_factor}x! Duration: {format_time(duration)} → {format_time(new_duration)}"

        return (sample_rate, speed_adjusted_audio), status

    except Exception as e:
        return None, f"❌ Error adjusting speed: {str(e)}"
324
+
325
+
326
def apply_silence_trimming(audio_file: str, threshold_db: float) -> tuple[tuple[int, np.ndarray] | None, str]:
    """Trim silence from the beginning and end of an audio file.

    Args:
        audio_file (str): Path to the input audio file (MP3, WAV, M4A, FLAC,
            OGG, and other common formats).
        threshold_db (float): Level in decibels below which audio counts as
            silence; more negative = quieter threshold. Common values:
            -30 dB (conservative), -40 dB (moderate), -60 dB (aggressive).

    Returns:
        tuple: ``((sample_rate, trimmed_audio), status)`` on success, or
        ``(None, error_message)`` on failure. The status reports the old and
        new durations.

    Note:
        Only leading/trailing silence is removed; silence within the audio
        is preserved. (Fixed: the return annotation previously used the
        builtin ``any`` function as a type; it is now ``np.ndarray``.)
    """
    if audio_file is None:
        return None, "Please upload an audio file first."

    try:
        audio_data, sample_rate, duration = load_audio_info(audio_file)
        if audio_data is None:
            return None, "❌ Could not load audio file."

        # Trim silence
        trimmed_audio = trim_silence(audio_data, threshold_db)

        new_duration = len(trimmed_audio) / sample_rate
        status = f"✅ Silence trimmed! Duration: {format_time(duration)} → {format_time(new_duration)}"

        return (sample_rate, trimmed_audio), status

    except Exception as e:
        return None, f"❌ Error trimming silence: {str(e)}"
382
+
383
+
384
def create_audio_effects_tab():
    """Create the audio effects tab interface.

    Builds the upload control, effect accordions (normalize, volume, fades,
    reverse/speed, silence trim), and output player, and wires each button
    to its processing function. UI layout only — all signal processing lives
    in utils.audio_utils.
    """

    gr.Markdown("Apply various audio effects and processing to your audio files.")

    with gr.Row():
        with gr.Column(scale=2):
            # File upload
            audio_input = gr.Audio(
                label="📤 Upload Audio File",
                type="filepath"
            )

            # Audio info
            duration_info = gr.Markdown("No file uploaded")
            stats_info = gr.Markdown("Audio stats: N/A")

            # Effects sections
            with gr.Accordion("🔊 Volume & Normalization", open=True):
                with gr.Row():
                    normalize_btn = gr.Button("📏 Normalize Audio", variant="secondary")
                    target_level = gr.Slider(
                        minimum=-20,
                        maximum=0,
                        value=-3,
                        step=0.1,
                        label="Target Level (dB)"
                    )

                with gr.Row():
                    volume_btn = gr.Button("🔊 Adjust Volume", variant="secondary")
                    gain_db = gr.Slider(
                        minimum=-20,
                        maximum=20,
                        value=0,
                        step=0.1,
                        label="Volume Gain (dB)"
                    )

            with gr.Accordion("🎭 Fade Effects", open=True):
                with gr.Row():
                    fade_btn = gr.Button("📈 Apply Fades", variant="secondary")
                    fade_in_ms = gr.Slider(
                        minimum=0,
                        maximum=5000,
                        value=100,
                        step=10,
                        label="Fade In (ms)"
                    )
                    fade_out_ms = gr.Slider(
                        minimum=0,
                        maximum=5000,
                        value=100,
                        step=10,
                        label="Fade Out (ms)"
                    )

            with gr.Accordion("⚡ Time & Speed Effects", open=True):
                with gr.Row():
                    reverse_btn = gr.Button("↩️ Reverse Audio", variant="secondary")
                    speed_btn = gr.Button("⏩ Change Speed", variant="secondary")
                    speed_factor = gr.Slider(
                        minimum=0.25,
                        maximum=4.0,
                        value=1.0,
                        step=0.1,
                        label="Speed Factor"
                    )

            with gr.Accordion("✂️ Audio Cleanup", open=True):
                with gr.Row():
                    trim_btn = gr.Button("🔇 Trim Silence", variant="secondary")
                    threshold_db = gr.Slider(
                        minimum=-60,
                        maximum=-10,
                        value=-40,
                        step=1,
                        label="Silence Threshold (dB)"
                    )

            # Status message
            status_msg = gr.Markdown("")

        with gr.Column(scale=1):
            # Output audio — processing functions return (sample_rate, ndarray)
            audio_output = gr.Audio(
                label="🎧 Processed Audio Result",
                type="numpy"
            )

            # Download info
            gr.Markdown("💾 **Download:** Right-click the audio player above and select 'Save audio as...'")

    # Event handlers — every effect button writes to the same output player
    # and status line, so effects can be chained by re-uploading the result.
    audio_input.change(
        fn=update_audio_info,
        inputs=[audio_input],
        outputs=[duration_info, stats_info]
    )

    # Normalization
    normalize_btn.click(
        fn=apply_normalization,
        inputs=[audio_input, target_level],
        outputs=[audio_output, status_msg]
    )

    # Volume adjustment
    volume_btn.click(
        fn=apply_volume_adjustment,
        inputs=[audio_input, gain_db],
        outputs=[audio_output, status_msg]
    )

    # Fades
    fade_btn.click(
        fn=apply_fades,
        inputs=[audio_input, fade_in_ms, fade_out_ms],
        outputs=[audio_output, status_msg]
    )

    # Reverse
    reverse_btn.click(
        fn=apply_reverse,
        inputs=[audio_input],
        outputs=[audio_output, status_msg]
    )

    # Speed change
    speed_btn.click(
        fn=apply_speed_adjustment,
        inputs=[audio_input, speed_factor],
        outputs=[audio_output, status_msg]
    )

    # Trim silence
    trim_btn.click(
        fn=apply_silence_trimming,
        inputs=[audio_input, threshold_db],
        outputs=[audio_output, status_msg]
    )

    # Usage tips
    with gr.Accordion("📋 Effects Guide", open=False):
        gr.Markdown("""
        **🔊 Volume & Normalization:**
        - **Normalize**: Adjusts peak level to target dB (recommended: -3dB)
        - **Volume Gain**: Increase/decrease volume by specified dB

        **🎭 Fade Effects:**
        - **Fade In**: Gradually increase volume from silence
        - **Fade Out**: Gradually decrease volume to silence

        **⚡ Time & Speed:**
        - **Reverse**: Play audio backwards
        - **Speed**: Change playback speed (1.0 = normal, 2.0 = double, 0.5 = half)

        **✂️ Cleanup:**
        - **Trim Silence**: Remove quiet sections from start/end

        **Tips:**
        - Always check audio stats before processing
        - Watch for clipping warnings when increasing volume
        - Use normalization for consistent levels across multiple files
        - Combine effects by processing sequentially
        """)
tabs/audio_merger_tab.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tabs/audio_merger_tab.py - Audio Merger Tab Component
2
+ import gradio as gr
3
+ from gradio_audiogrid import AudioGrid
4
+ from numpy import ndarray
5
+
6
+ from utils.audio_utils import load_audio_info, format_time, merge_audio_files
7
+
8
+
9
def update_file_status(file_paths):
    """Update the duration info when a new file is uploaded.

    UI-only helper: returns the two strings shown in the upload-status and
    file-details textboxes. It should NOT be used by agents or automated
    systems; agents should call the underlying audio utilities directly.

    Args:
        file_paths: List of uploaded audio file paths (may be None/empty).

    Returns:
        tuple[str, str]: (status summary, per-file details).
    """
    if not file_paths or len(file_paths) == 0:
        return "No files uploaded yet", "🔄 Ready to upload audio files"

    # Get info about uploaded files
    total_duration = 0
    valid_files = 0
    file_info = []

    for i, file_path in enumerate(file_paths):
        try:
            audio_data, sample_rate, duration = load_audio_info(file_path)
            if audio_data is not None:
                valid_files += 1
                total_duration += duration
                file_info.append(f" {i + 1}. {duration:.1f}s ({sample_rate:,} Hz)")
            else:
                # load_audio_info signals failure by returning None rather than
                # raising; previously these files disappeared from the list
                # entirely, misnumbering the remaining entries.
                file_info.append(f" {i + 1}. ❌ Invalid file")
        except Exception:
            # Was a bare `except:` — that also swallowed KeyboardInterrupt/SystemExit.
            file_info.append(f" {i + 1}. ❌ Invalid file")

    if valid_files == 0:
        status = "❌ No valid audio files found"
        details = "Please upload valid audio files (MP3, WAV, FLAC, etc.)"
    elif valid_files == 1:
        status = f"📁 1 valid file uploaded ({format_time(total_duration)})"
        details = "Add at least one more file to enable merging"
    else:
        status = f"📁 {valid_files} files ready ({format_time(total_duration)} total)"
        details = f"Files in merge order:\n" + "\n".join(file_info[:5])  # Show first 5
        if len(file_info) > 5:
            details += f"\n ... and {len(file_info) - 5} more files"

    return status, details
48
+
49
+
50
+ def process_merge(file_paths: list[str]) -> tuple[tuple[int, ndarray] | None, str]:
51
+ """Process the merging of multiple audio files into a single continuous audio file.
52
+
53
+ This function takes a list of audio file URLs and merges them sequentially into
54
+ one continuous audio file. It handles sample rate conversion, format normalization,
55
+ and provides detailed status information about the merge operation.
56
+
57
+ Args:
58
+ file_paths (list[str]): List of full URLs to audio files to be merged
59
+ (minimum 2 files required, supports MP3, WAV, M4A, FLAC, OGG, etc.)
60
+
61
+ Returns:
62
+ tuple: A tuple containing:
63
+ - First element: Either a tuple of (sample_rate: int, merged_audio_data: array)
64
+ for the merged audio result, or None if an error occurred
65
+ - Second element: A status message string with merge details and success/error info
66
+
67
+ Example:
68
+ result, status = process_merge(["url/to/file1.mp3", "url/to/file2.wav"])
69
+ if result is not None:
70
+ sample_rate, audio_data = result
71
+ print(f"Merge successful: {status}")
72
+ else:
73
+ print(f"Error: {status}")
74
+
75
+ Note:
76
+ - Requires at least 2 audio files to perform merge operation
77
+ - Files are merged in the order provided in the list
78
+ - Automatically handles sample rate conversion to match the first file
79
+ - Converts stereo files to mono for consistency
80
+ - Returns detailed status with duration and file information
81
+ - Output maintains the sample rate of the first valid audio file
82
+ """
83
+ if not file_paths or len(file_paths) < 2:
84
+ return None, "❌ Please upload at least 2 audio files to merge"
85
+
86
+ # Call the merge function
87
+ result, status = merge_audio_files(file_paths)
88
+
89
+ return result, status
90
+
91
+
92
def reset_everything():
    """Reset the merger UI to its initial state.

    This component should not be used by agents or automated systems.
    """
    # Order matches the outputs wired in create_audio_merger_tab:
    # (audio grid, merged audio, status text, details text).
    fresh_grid = []
    return fresh_grid, None, "No files uploaded yet", "🔄 Ready to upload audio files"
96
+
97
+
98
def create_audio_merger_tab():
    """Create the audio merger tab interface.

    Builds the Gradio layout (upload grid, status boxes, merge/clear buttons,
    result panes) and wires the event handlers. Must be called inside an
    active gr.Blocks/gr.Tab context.
    """

    gr.Markdown("Upload multiple audio files and merge them in sequence. Drag to reorder files before merging.")

    with gr.Row():
        with gr.Column(scale=2):
            # Audio Grid Upload — custom component; drag order defines merge order.
            gr.Markdown("### 📁 Upload & Arrange Audio Files")

            audio_files = AudioGrid(
                value=[],
                label="Drag files here or click to upload (supports MP3, WAV, FLAC, OGG, M4A, AAC)",
                interactive=True,
            )

            # File status (one-line summary, filled by update_file_status)
            file_status = gr.Textbox(
                value="No files uploaded yet",
                label="📊 Upload Status",
                interactive=False,
                lines=1
            )

            # Detailed file info (per-file durations/sample rates)
            file_details = gr.Textbox(
                value="🔄 Ready to upload audio files",
                label="📋 File Details",
                interactive=False,
                lines=6
            )

        with gr.Column(scale=1):
            gr.Markdown("### 🎛️ Merge Controls")

            merge_btn = gr.Button(
                "🎵 Merge Audio Files",
                variant="primary",
                size="lg"
            )

            clear_btn = gr.Button(
                "🗑️ Clear All Files",
                variant="secondary",
                size="lg"
            )

            # Instructions
            gr.Markdown("""
            **📋 Instructions:**
            1. **Upload** 2+ audio files using drag & drop
            2. **Reorder** by dragging files in the grid
            3. **Merge** files in the displayed order
            4. **Download** the merged result

            **🎯 Features:**
            • Automatic sample rate conversion
            • Stereo to mono conversion
            • Duration calculations
            • High-quality WAV output
            """)

    # Results section
    with gr.Row():
        with gr.Column(scale=1):
            # Status output (multi-line merge report from merge_audio_files)
            merge_status = gr.Textbox(
                value="Ready to merge! Upload at least 2 audio files to get started.",
                label="🔍 Merge Status & Details",
                interactive=False,
                lines=8
            )

        with gr.Column(scale=1):
            # Audio output — type="numpy" matches the (sample_rate, array)
            # tuple returned by process_merge.
            merged_audio = gr.Audio(
                label="🎵 Merged Audio Result",
                type="numpy",
                interactive=False
            )

    # Event handlers
    # Any change to the grid refreshes the status/details boxes.
    audio_files.change(
        fn=update_file_status,
        inputs=[audio_files],
        outputs=[file_status, file_details]
    )

    merge_btn.click(
        fn=process_merge,
        inputs=[audio_files],
        outputs=[merged_audio, merge_status]
    )

    # reset_everything returns values for all four outputs in this order.
    clear_btn.click(
        fn=reset_everything,
        outputs=[audio_files, merged_audio, file_status, file_details]
    )
tabs/audio_transcription_tab.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tabs/audio_transcription_tab.py - Audio Transcription Tab Component
2
+ import asyncio
3
+ import json
4
+
5
+ import gradio as gr
6
+
7
+ from utils.audio_utils import load_audio_info, format_time
8
+ from utils.transcription_utils import transcribe
9
+
10
+
11
+ def update_transcription_info(audio_file):
12
+ """This should not be used by agents, only for UI updates"""
13
+ if audio_file is None:
14
+ return "No file uploaded", "Ready to transcribe"
15
+
16
+ audio_data, sample_rate, duration = load_audio_info(audio_file)
17
+
18
+ if audio_data is None:
19
+ return "❌ Could not read audio file", "File error"
20
+
21
+ duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
22
+ status_text = f"🎵 Sample rate: {sample_rate:,} Hz | Ready for transcription"
23
+
24
+ return duration_text, status_text
25
+
26
+
27
+ def format_transcription_segments(segments):
28
+ """Format transcription segments with timestamps"""
29
+ if not segments:
30
+ return "No segments found"
31
+
32
+ formatted_text = ""
33
+ for i, segment in enumerate(segments):
34
+ start_time = segment.get('start', 0)
35
+ end_time = segment.get('end', 0)
36
+ text = segment.get('text', '').strip()
37
+
38
+ if text:
39
+ formatted_text += f"**[{format_time(start_time)} - {format_time(end_time)}]**\n"
40
+ formatted_text += f"{text}\n\n"
41
+
42
+ return formatted_text
43
+
44
+
45
+ def format_word_level_transcription(segments):
46
+ """Format word-level transcription with confidence scores"""
47
+ if not segments:
48
+ return "No word-level data available"
49
+
50
+ formatted_text = ""
51
+ for segment in segments:
52
+ words = segment.get('words', [])
53
+ if words:
54
+ for word in words:
55
+ word_text = word.get('word', '')
56
+ confidence = word.get('score', 0)
57
+ start_time = word.get('start', 0)
58
+
59
+ # Color code based on confidence
60
+ if confidence > 0.9:
61
+ color = "green"
62
+ elif confidence > 0.7:
63
+ color = "orange"
64
+ else:
65
+ color = "red"
66
+
67
+ formatted_text += f'<span style="color: {color}; font-weight: bold;" title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">{word_text}</span> '
68
+ formatted_text += "\n\n"
69
+
70
+ return formatted_text
71
+
72
+
73
+ def format_json_for_display(transcription_data):
74
+ """Format transcription data as pretty JSON string"""
75
+ return json.dumps(transcription_data, indent=2, ensure_ascii=False)
76
+
77
+
78
+ async def process_transcription(audio_file):
79
+ """Process audio transcription"""
80
+ if audio_file is None:
81
+ return "Please upload an audio file first.", "", "", ""
82
+
83
+ try:
84
+ # Read audio file as bytes
85
+ with open(audio_file, 'rb') as f:
86
+ audio_bytes = f.read()
87
+
88
+ # Call transcription API
89
+ transcription_result = await transcribe(audio_bytes)
90
+
91
+ # Extract information
92
+ full_text = transcription_result.get('full_text', '')
93
+ segments = transcription_result.get('segments', [])
94
+ language = transcription_result.get('language_detected', 'Unknown')
95
+ processing_time = transcription_result.get('processing_time_seconds', 0)
96
+
97
+ # Format results
98
+ status = f"✅ Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"
99
+
100
+ # Create formatted outputs
101
+ segments_formatted = format_transcription_segments(segments)
102
+
103
+ # Format JSON for display
104
+ json_formatted = format_json_for_display(transcription_result)
105
+
106
+ return status, full_text, segments_formatted, json_formatted
107
+
108
+ except Exception as e:
109
+ return f"❌ Error during transcription: {str(e)}", "", "", ""
110
+
111
+
112
+ def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
113
+ """Synchronously transcribe an audio file using AI-powered speech recognition.
114
+
115
+ This function provides a synchronous wrapper around the async transcription process,
116
+ converting audio files to text using advanced speech recognition. It handles the
117
+ async/await complexity internally and returns detailed transcription results including
118
+ the full text, timestamped segments, language detection, and processing statistics.
119
+
120
+ Args:
121
+ audio_file (str): Full URL to the input audio file to be transcribed
122
+ (supports MP3, WAV, M4A, FLAC, OGG, and other common audio formats)
123
+
124
+ Returns:
125
+ tuple: A tuple containing four string elements:
126
+ - status (str): Status message indicating success with language and processing time,
127
+ or error information if transcription failed
128
+ - full_text (str): Complete transcription as plain text, or empty string on error
129
+ - segments_formatted (str): Formatted text showing timestamped segments with
130
+ start/end times and confidence scores, or empty string on error
131
+ - json_formatted (str): Pretty-formatted JSON string containing complete transcription
132
+ data including word-level timestamps and metadata, or empty string on error.
133
+ The JSON structure includes:
134
+ * "filename": original audio filename
135
+ * "language_detected": detected language code (e.g., "en", "es", "fr")
136
+ * "full_text": complete transcription text
137
+ * "segments": array of text segments with timing and word breakdowns
138
+ * "processing_time_seconds": time taken for transcription
139
+ Each segment contains: start/end times, text, and words array with individual
140
+ word timestamps and confidence scores (0.0-1.0 range)
141
+
142
+ Example:
143
+ status, text, segments, json_data = transcribe_audio_sync("url/to/audio.mp3")
144
+ if "✅" in status:
145
+ print(f"Success: {status}")
146
+ print(f"Transcription: {text}")
147
+ print(f"Segments: {segments}")
148
+ else:
149
+ print(f"Error: {status}")
150
+
151
+ Note:
152
+ - Automatically detects language in the audio file
153
+ - Provides word-level and segment-level timestamps for precise audio editing
154
+ - Returns confidence scores for quality assessment
155
+ - Handles various audio formats and sample rates automatically
156
+ - Processing time depends on audio length and complexity
157
+ - All timestamps are provided in seconds with decimal precision
158
+ - Function blocks until transcription is complete (synchronous)
159
+ - For async usage, use process_transcription() directly instead
160
+ """
161
+ try:
162
+ loop = asyncio.new_event_loop()
163
+ asyncio.set_event_loop(loop)
164
+ result = loop.run_until_complete(
165
+ process_transcription(audio_file)
166
+ )
167
+ loop.close()
168
+ return result
169
+ except Exception as e:
170
+ return f"❌ Error: {str(e)}", "", "", ""
171
+
172
+ def create_audio_transcription_tab():
173
+ """Create the audio transcription tab interface"""
174
+
175
+ gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
176
+ gr.Markdown("**Powered by Modal Labs**")
177
+ gr.Image(
178
+ value="assets/modal-logo.png",
179
+ show_label=False,
180
+ container=False,
181
+ show_fullscreen_button=False,
182
+ show_download_button=False,
183
+ width=200,
184
+ height=200
185
+ )
186
+
187
+ with gr.Row():
188
+ with gr.Column(scale=2):
189
+ # File upload
190
+ audio_input = gr.Audio(
191
+ label="📤 Upload Audio File",
192
+ type="filepath"
193
+ )
194
+
195
+ # Audio info
196
+ duration_info = gr.Markdown("No file uploaded")
197
+ status_info = gr.Markdown("Ready to transcribe")
198
+
199
+ # Transcribe button
200
+ transcribe_btn = gr.Button("🎤 Start Transcription", variant="primary", size="lg")
201
+
202
+ # Status message
203
+ status_msg = gr.Markdown("")
204
+
205
+ # Results section
206
+ with gr.Row():
207
+ with gr.Column():
208
+ # Full transcription
209
+ full_text_output = gr.Textbox(
210
+ label="📝 Full Transcription",
211
+ lines=10,
212
+ max_lines=20,
213
+ placeholder="Transcription will appear here..."
214
+ )
215
+
216
+ with gr.Column():
217
+ # Segmented transcription with timestamps
218
+ segments_output = gr.Markdown(
219
+ label="⏱️ Timestamped Segments",
220
+ value="Segments with timestamps will appear here..."
221
+ )
222
+
223
+ # JSON Results section
224
+ with gr.Row():
225
+ with gr.Column():
226
+ gr.Markdown("### 📄 JSON Results")
227
+ json_output = gr.Textbox(
228
+ label="Complete JSON Data",
229
+ lines=15,
230
+ max_lines=25,
231
+ placeholder="JSON transcription data will appear here...",
232
+ show_copy_button=True
233
+ )
234
+
235
+ # Event handlers
236
+ audio_input.change(
237
+ fn=update_transcription_info,
238
+ inputs=[audio_input],
239
+ outputs=[duration_info, status_info]
240
+ )
241
+
242
+ transcribe_btn.click(
243
+ fn=transcribe_audio_sync,
244
+ inputs=[audio_input],
245
+ outputs=[status_msg, full_text_output, segments_output, json_output]
246
+ )
247
+
248
+ # Usage tips
249
+ with gr.Accordion("📋 Transcription Guide", open=False):
250
+ gr.Markdown("""
251
+ **🎤 Supported Features:**
252
+ - **Multiple Languages**: Automatic language detection
253
+ - **High Accuracy**: Professional-grade transcription
254
+ - **Word Timestamps**: Precise timing for each word
255
+ - **Confidence Scores**: Quality indicators for each word
256
+ - **JSON Output**: Complete structured data
257
+
258
+ **📁 File Requirements:**
259
+ - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
260
+ - **Duration**: Best results with files under 10 minutes
261
+ - **Quality**: Clear audio produces better quality results
262
+
263
+ **💡 Tips:**
264
+ - Use high-quality audio for best results
265
+ - Consider splitting long files into segments
266
+ - Copy JSON data using the copy button for easy access
267
+ - JSON contains all metadata including word-level timestamps
268
+
269
+ **📊 JSON Structure:**
270
+ - **full_text**: Complete transcription text
271
+ - **segments**: Timestamped text segments
272
+ - **language_detected**: Detected language code
273
+ - **processing_time_seconds**: API processing duration
274
+ """)
utils/__init__.py ADDED
File without changes
utils/audio_utils.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import numpy as np
3
+ import soundfile as sf
4
+ from numpy import ndarray
5
+
6
+
7
def load_audio(file_path: str) -> tuple[ndarray, int]:
    """
    Read an audio file from disk.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        tuple: (samples, sample_rate) as produced by soundfile.read;
            exceptions from unreadable files propagate to the caller.
    """
    samples, rate = sf.read(file_path)
    return samples, rate
19
+
20
+
21
+ def cut_audio(_audio: ndarray, sampling_rate: int | float, start_millis: int, end_millis: int) -> ndarray:
22
+ """Cut audio array from start_millis to end_millis"""
23
+ start_sample = int(start_millis / 1000 * sampling_rate)
24
+ end_sample = int(end_millis / 1000 * sampling_rate)
25
+ return _audio[start_sample:end_sample]
26
+
27
+
28
def format_time(seconds):
    """Render a duration in seconds as zero-padded MM:SS (truncating)."""
    whole_minutes = int(seconds // 60)
    leftover_secs = int(seconds % 60)
    return f"{whole_minutes:02d}:{leftover_secs:02d}"
33
+
34
+
35
def load_audio_info(audio_file):
    """Read an audio file and report its data, sample rate, and duration.

    Returns:
        tuple: (samples, sample_rate, duration_seconds), or
            (None, None, None) when the path is None or the file is unreadable
            (the error is printed rather than raised).
    """
    if audio_file is None:
        return None, None, None

    try:
        samples, rate = sf.read(audio_file)
    except Exception as e:
        print(f"Error loading audio: {e}")
        return None, None, None

    return samples, rate, len(samples) / rate
51
+
52
+
53
def get_audio_duration(audio_file):
    """Return the duration of audio_file in seconds, or None on any failure."""
    try:
        # sf.info reads only the header — no full decode needed.
        meta = sf.info(audio_file)
        return meta.frames / meta.samplerate
    except Exception:
        return None
60
+
61
+
62
def merge_audio_arrays(audios: list[ndarray]) -> ndarray:
    """Concatenate audio clips end-to-end into one array."""
    combined = np.concatenate(audios)
    return combined
65
+
66
+
67
def apply_fade_in(audio: ndarray, sample_rate: int, fade_duration_ms: int = 100) -> ndarray:
    """Apply a linear fade-in over the start of the clip.

    Args:
        audio: Float sample array, mono (n,) or multi-channel (n, channels).
        sample_rate: Samples per second.
        fade_duration_ms: Ramp length in milliseconds (clamped to clip length).

    Returns:
        A faded copy of `audio`; the input is never modified.
    """
    fade_samples = min(int(fade_duration_ms / 1000 * sample_rate), len(audio))
    if fade_samples <= 0:
        # Nothing to fade; still return a copy for a consistent contract.
        return audio.copy()

    ramp = np.linspace(0, 1, fade_samples)
    if audio.ndim > 1:
        # Fix: the 1-D ramp did not broadcast against (n, channels) audio and
        # raised a shape error for stereo input — add a channel axis.
        ramp = ramp[:, np.newaxis]

    faded = audio.copy()
    faded[:fade_samples] *= ramp
    return faded
78
+
79
+
80
def apply_fade_out(audio: ndarray, sample_rate: int, fade_duration_ms: int = 100) -> ndarray:
    """Apply a linear fade-out over the end of the clip.

    Args:
        audio: Float sample array, mono (n,) or multi-channel (n, channels).
        sample_rate: Samples per second.
        fade_duration_ms: Ramp length in milliseconds (clamped to clip length).

    Returns:
        A faded copy of `audio`; the input is never modified.
    """
    fade_samples = min(int(fade_duration_ms / 1000 * sample_rate), len(audio))
    if fade_samples <= 0:
        # Fix: with fade_samples == 0 the original `audio_copy[-0:]` slice
        # addressed the WHOLE array, so multiplying by an empty ramp raised a
        # broadcast error. Return an untouched copy instead.
        return audio.copy()

    ramp = np.linspace(1, 0, fade_samples)
    if audio.ndim > 1:
        # Fix: broadcast the 1-D ramp across channels for stereo input.
        ramp = ramp[:, np.newaxis]

    faded = audio.copy()
    faded[-fade_samples:] *= ramp
    return faded
91
+
92
+
93
def normalize_audio(audio: ndarray, target_level: float = -3.0) -> ndarray:
    """
    Scale audio so its absolute peak sits at target_level dBFS.

    target_level: Target peak level in dB (e.g. -3.0 for -3 dB).
    Silent input is returned unchanged to avoid dividing by zero.
    """
    current_peak = np.max(np.abs(audio))
    if current_peak == 0:
        return audio

    # dB → linear amplitude, then scale so the peak lands exactly there.
    desired_peak = 10 ** (target_level / 20)
    gain = desired_peak / current_peak
    return audio * gain
112
+
113
+
114
def adjust_volume(audio: ndarray, gain_db: float) -> ndarray:
    """
    Scale audio by gain_db decibels.

    gain_db: positive values boost, negative values attenuate; 0 dB is unity.
    """
    scale = 10 ** (gain_db / 20)
    return audio * scale
121
+
122
+
123
def apply_silence(duration_ms: int, sample_rate: int) -> ndarray:
    """Return a zero-filled (silent) buffer spanning duration_ms at sample_rate."""
    sample_count = int(duration_ms / 1000 * sample_rate)
    return np.zeros(sample_count)
128
+
129
+
130
def reverse_audio(audio: ndarray) -> ndarray:
    """Return the samples of audio in reverse playback order (a view, no copy)."""
    return audio[::-1]
133
+
134
+
135
def apply_speed_change(audio: ndarray, speed_factor: float) -> ndarray:
    """
    Change playback speed without changing pitch (simple time-stretching).

    speed_factor: 1.0 = normal, 2.0 = double speed, 0.5 = half speed.

    NOTE(review): delegates entirely to librosa's phase-vocoder time stretch;
    librosa assumes floating-point mono input — integer or multi-channel
    arrays likely need conversion first (TODO confirm against librosa docs).
    """

    return librosa.effects.time_stretch(audio, rate=speed_factor)
142
+
143
+
144
def trim_silence(audio: ndarray, threshold_db: float = -40.0) -> ndarray:
    """
    Strip leading and trailing samples quieter than threshold_db.

    threshold_db: Silence threshold in dB (relative to full scale).
    Audio that is entirely below the threshold is returned unchanged.
    """
    # dB threshold → linear amplitude floor.
    amplitude_floor = 10 ** (threshold_db / 20)

    # Indices of every sample that is audibly above the floor.
    loud_indices = np.flatnonzero(np.abs(audio) > amplitude_floor)
    if loud_indices.size == 0:
        return audio

    # Keep everything from the first to the last loud sample, inclusive.
    return audio[loud_indices[0]:loud_indices[-1] + 1]
164
+
165
+
166
def get_audio_stats(audio: ndarray, sample_rate: int) -> dict:
    """Compute basic level and shape statistics for an audio buffer.

    Returns a dict with duration_seconds, peak_level_db, rms_level_db
    (-inf for silent audio), sample_rate, samples, and channels.
    """
    peak = np.max(np.abs(audio))
    rms = np.sqrt(np.mean(audio ** 2))

    def _to_db(level):
        # Guard log10(0) for silent input.
        return 20 * np.log10(level) if level > 0 else -np.inf

    return {
        'duration_seconds': len(audio) / sample_rate,
        'peak_level_db': _to_db(peak),
        'rms_level_db': _to_db(rms),
        'sample_rate': sample_rate,
        'samples': len(audio),
        'channels': 1 if audio.ndim == 1 else audio.shape[1],
    }
184
+
185
+
186
+ def merge_audio_files(file_paths: list[str]) -> tuple[tuple[ndarray, int | float] | None, str]:
187
+ """
188
+ Merge multiple audio files by concatenating them
189
+
190
+ Args:
191
+ file_paths: List of audio file paths
192
+
193
+ Returns:
194
+ tuple: (sample_rate, merged_audio_array, status_message)
195
+ """
196
+ if not file_paths or len(file_paths) == 0:
197
+ return None, "❌ No audio files to merge"
198
+
199
+ if len(file_paths) == 1:
200
+ return None, "❌ Please upload at least 2 audio files to merge"
201
+
202
+ try:
203
+ merged_audio_segments = []
204
+ target_sample_rate = None
205
+ file_durations = []
206
+
207
+ for i, file_path in enumerate(file_paths):
208
+ # Load audio file
209
+ audio_data, sample_rate, duration = load_audio_info(file_path)
210
+
211
+ if audio_data is None:
212
+ continue
213
+
214
+ # Set target sample rate from first file
215
+ if target_sample_rate is None:
216
+ target_sample_rate = sample_rate
217
+ elif sample_rate != target_sample_rate:
218
+ # Resample if different sample rate
219
+ from scipy import signal
220
+ num_samples = int(len(audio_data) * target_sample_rate / sample_rate)
221
+ audio_data = signal.resample(audio_data, num_samples)
222
+
223
+ # Convert stereo to mono if needed
224
+ if len(audio_data.shape) > 1:
225
+ audio_data = np.mean(audio_data, axis=1)
226
+
227
+ merged_audio_segments.append(audio_data)
228
+ file_durations.append(len(audio_data) / target_sample_rate)
229
+
230
+ if not merged_audio_segments:
231
+ return None, "❌ No valid audio files found"
232
+
233
+ # Concatenate all audio arrays
234
+ final_audio = np.concatenate(merged_audio_segments)
235
+
236
+ # Create status message
237
+ total_duration = len(final_audio) / target_sample_rate
238
+
239
+ status = f"""✅ Successfully merged {len(file_paths)} audio files!
240
+
241
+ 🎵 **Merge Details:**
242
+ • Total duration: {format_time(total_duration)} ({total_duration:.2f} seconds)
243
+ • Sample rate: {target_sample_rate:,} Hz
244
+ • Files processed: {len(merged_audio_segments)}
245
+ • Individual durations: {', '.join([f'{d:.1f}s' for d in file_durations])}
246
+
247
+ 🎧 **Result:** Ready for playback and download!"""
248
+
249
+ return (target_sample_rate, final_audio), status
250
+
251
+ except Exception as e:
252
+ return None, f"❌ Error merging audio files: {str(e)}"
utils/transcription_utils.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/transcription_utils.py - Audio Transcription Utilities
2
+ import json
3
+ import os
4
+ from typing import Optional, Dict, Any
5
+
6
+ import aiohttp
7
+
8
+
9
async def _send_bytes_as_form_data(
        file_bytes: bytes,
        endpoint_url: str,
        field_name: str = "file",
        auth_token: Optional[str] = None,
        content_type: str = "application/octet-stream"
) -> Dict[str, Any]:
    """
    POST raw bytes to an endpoint as a single multipart form field.

    Args:
        file_bytes: Bytes content to send.
        endpoint_url: URL to POST to.
        field_name: Multipart field name (default: "file").
        auth_token: Optional bearer token for the Authorization header.
        content_type: MIME type of the payload.

    Returns:
        dict with 'status' (HTTP code), 'success' (status < 400),
        'response' (body text), and 'headers' (response headers).
    """
    form = aiohttp.FormData()
    form.add_field(field_name, file_bytes, content_type=content_type)

    # Only send an Authorization header when a token was supplied.
    request_headers = {'Authorization': f'Bearer {auth_token}'} if auth_token else None

    async with aiohttp.ClientSession() as session:
        async with session.post(endpoint_url, data=form, headers=request_headers) as response:
            body = await response.text()
            return {
                'status': response.status,
                'success': response.status < 400,
                'response': body,
                'headers': dict(response.headers),
            }
57
+
58
+
59
async def transcribe(_bytes: bytes) -> dict:
    """
    Transcribe audio bytes using the Modal transcription endpoint.

    Args:
        _bytes: Raw audio file bytes (any common container format).

    Returns:
        dict: Parsed transcription result (full_text, segments,
            language_detected, processing_time_seconds, ...).

    Raises:
        RuntimeError: If MODAL_AUTH_TOKEN is not set, or the endpoint
            returns an HTTP error status.
        json.JSONDecodeError: If a successful response body is not JSON.
    """
    auth_token = os.environ.get('MODAL_AUTH_TOKEN')
    if not auth_token:
        # Fail with an actionable message instead of a bare KeyError.
        raise RuntimeError("MODAL_AUTH_TOKEN environment variable is not set")

    response = await _send_bytes_as_form_data(
        file_bytes=_bytes,
        endpoint_url='https://yigitsekerci6174--transcribe-audio.modal.run',
        auth_token=auth_token,
        field_name='file'
    )

    if not response['success']:
        # Previously an HTTP error page went straight into json.loads and
        # surfaced as a cryptic JSONDecodeError; report the status instead.
        raise RuntimeError(
            f"Transcription request failed with HTTP {response['status']}: "
            f"{response['response'][:200]}"
        )

    return json.loads(response['response'])
uv.lock ADDED
The diff for this file is too large to render. See raw diff