Spaces:

Agents-MCP-Hackathon
/

AudioEditor

Sleeping

App Files Files Community

Ahmet Emre Şafak committed on Jun 8

Commit

0a0ea7b

0 Parent(s):

initial commit

Browse files

Files changed (17) hide show

.gitattributes +35 -0
.gitignore +7 -0
.python-version +1 -0
README.md +14 -0
app.py +37 -0
assets/modal-logo.png +0 -0
pyproject.toml +16 -0
requirements.txt +101 -0
tabs/__init__.py +0 -0
tabs/audio_cutter_tab.py +189 -0
tabs/audio_effects_tab.py +549 -0
tabs/audio_merger_tab.py +195 -0
tabs/audio_transcription_tab.py +274 -0
utils/__init__.py +0 -0
utils/audio_utils.py +252 -0
utils/transcription_utils.py +77 -0
uv.lock +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+.env
+.venv/
+/.idea/
+.gradio/
+**/__pycache__/
+.DS_STORE
+**/.DS_STORE

.python-version ADDED Viewed

	@@ -0,0 +1 @@


1	+ 3.11

README.md ADDED Viewed

	@@ -0,0 +1,14 @@

+---
+title: Foo
+emoji: 📚
+colorFrom: yellow
+colorTo: green
+sdk: gradio
+sdk_version: 5.33.0
+app_file: app.py
+pinned: false
+license: mit
+short_description: i'm trying to learn gradio
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import gradio as gr
+from dotenv import load_dotenv
+from tabs.audio_cutter_tab import create_audio_cutter_tab
+from tabs.audio_effects_tab import create_audio_effects_tab
+from tabs.audio_merger_tab import create_audio_merger_tab
+from tabs.audio_transcription_tab import create_audio_transcription_tab
+def create_app():
+    """Create the main Gradio application with multiple tabs"""
+    with gr.Blocks(title="Audio Toolkit", theme=gr.themes.Soft()) as app:
+        gr.Markdown("# 🎵 Audio Toolkit")
+        gr.Markdown("A comprehensive audio processing toolkit with multiple tools.")
+        with gr.Tabs():
+            # Audio Cutter Tab
+            with gr.TabItem("✂️ Audio Cutter"):
+                create_audio_cutter_tab()
+            # Placeholder for future tabs
+            with gr.TabItem("🔗 Audio Merger"):
+                create_audio_merger_tab()
+            with gr.TabItem("🎛️ Audio Effects"):
+                create_audio_effects_tab()
+            with gr.TabItem("📊 Audio Transcription"):
+                create_audio_transcription_tab()
+    return app
+# Script entry point: the app is only built and launched when this file is run directly.
+if __name__ == "__main__":
+    # load_dotenv comes from the `dotenv` package; presumably it loads settings
+    # from a local .env file (which .gitignore excludes) — confirm against deployment.
+    load_dotenv()
+    gradio_app = create_app()
+    # Launch with Gradio's MCP server option enabled in addition to the web UI.
+    gradio_app.launch(mcp_server=True)

assets/modal-logo.png ADDED Viewed

pyproject.toml ADDED Viewed

	@@ -0,0 +1,16 @@

+[project]
+name = "AudioEditor"
+version = "0.1.0"
+description = "Edit your audio files with ease using this Gradio component."
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "aiohttp>=3.12.11",
+    "dotenv>=0.9.9",
+    "gradio>=5.33.0",
+    "gradio-audiogrid>=0.0.2",
+    "librosa",
+    "mcp",
+    "numpy>=2.3.0",
+    "soundfile>=0.13.1",
+]

requirements.txt ADDED Viewed

	@@ -0,0 +1,101 @@

+aiofiles==24.1.0
+aiohappyeyeballs==2.6.1
+aiohttp==3.12.11
+aiosignal==1.3.2
+annotated-types==0.7.0
+anyio==4.9.0
+attrs==25.3.0
+audioread==3.0.1
+backports-tarfile==1.2.0
+certifi==2025.4.26
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.2.1
+decorator==5.2.1
+docutils==0.21.2
+dotenv==0.9.9
+fastapi==0.115.12
+ffmpy==0.6.0
+filelock==3.18.0
+frozenlist==1.6.2
+fsspec==2025.5.1
+gradio==5.33.0
+gradio-audiogrid==0.0.2
+gradio-client==1.10.2
+groovy==0.1.2
+h11==0.16.0
+hf-xet==1.1.3
+httpcore==1.0.9
+httpx==0.28.1
+httpx-sse==0.4.0
+huggingface-hub==0.32.4
+id==1.5.0
+idna==3.10
+importlib-metadata==8.7.0
+jaraco-classes==3.4.0
+jaraco-context==6.0.1
+jaraco-functools==4.1.0
+jinja2==3.1.6
+joblib==1.5.1
+keyring==25.6.0
+lazy-loader==0.4
+librosa==0.11.0
+llvmlite==0.44.0
+markdown-it-py==3.0.0
+markupsafe==3.0.2
+mcp==1.9.3
+mdurl==0.1.2
+more-itertools==10.7.0
+msgpack==1.1.0
+multidict==6.4.4
+nh3==0.2.21
+numba==0.61.2
+numpy==2.2.6
+orjson==3.10.18
+packaging==25.0
+pandas==2.3.0
+pillow==11.2.1
+platformdirs==4.3.8
+pooch==1.8.2
+propcache==0.3.1
+pycparser==2.22
+pydantic==2.11.5
+pydantic-core==2.33.2
+pydantic-settings==2.9.1
+pydub==0.25.1
+pygments==2.19.1
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.0
+python-multipart==0.0.20
+pytz==2025.2
+pyyaml==6.0.2
+readme-renderer==44.0
+requests==2.32.3
+requests-toolbelt==1.0.0
+rfc3986==2.0.0
+rich==14.0.0
+ruff==0.11.13
+safehttpx==0.1.6
+scikit-learn==1.7.0
+scipy==1.15.3
+semantic-version==2.10.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+soundfile==0.13.1
+soxr==0.5.0.post1
+sse-starlette==2.3.6
+starlette==0.46.2
+threadpoolctl==3.6.0
+tomlkit==0.13.3
+tqdm==4.67.1
+twine==6.1.0
+typer==0.16.0
+typing-extensions==4.14.0
+typing-inspection==0.4.1
+tzdata==2025.2
+urllib3==2.4.0
+uvicorn==0.34.3
+websockets==15.0.1
+yarl==1.20.0
+zipp==3.22.0

tabs/__init__.py ADDED Viewed

File without changes

tabs/audio_cutter_tab.py ADDED Viewed

	@@ -0,0 +1,189 @@

+# tabs/audio_cutter_tab.py - Audio Cutter Tab Component
+import gradio as gr
+from numpy import ndarray
+from utils.audio_utils import cut_audio, format_time, load_audio_info
+def update_duration_info(audio_file):
+    """Update the duration info when a new file is uploaded
+        This function is designed for UI purposes only to update Gradio interface elements
+        when a new audio file is uploaded. It should NOT be used by agents or automated
+        systems as it returns Gradio update objects for slider components. Agents should
+        use the underlying audio utility functions directly instead.
+        """
+    print("Logging audiofile")
+    print(audio_file)
+    if audio_file is None:
+        return "No file uploaded", "Sample rate: N/A", gr.update(maximum=100), gr.update(maximum=100)
+    # Load audio info
+    audio_data, sample_rate, duration = load_audio_info(audio_file)
+    if duration is None:
+        return "❌ Could not read audio file", "Sample rate: N/A", gr.update(maximum=100), gr.update(maximum=100)
+    duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
+    sample_rate_text = f"🎵 Sample rate: {sample_rate:,} Hz"
+    # Update sliders with new maximum
+    return duration_text, sample_rate_text, gr.update(maximum=duration, value=0), gr.update(maximum=duration,
+                                                                                            value=min(30, duration))
+def process_cut_audio(audio_file: str, _start_time: float, _end_time: float) -> tuple[tuple[int | float, ndarray] | None, str]:
+    """Process audio cutting to extract a segment from an audio file.
+    This function loads an audio file, validates the time parameters, and cuts out
+    a specific segment between the start and end times. It handles various audio
+    formats and provides detailed error messages for troubleshooting.
+    Args:
+        audio_file (str): Full URL to the input audio file to be cut
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+        _start_time (float): Start time in seconds where the cut should begin
+                           (will be clamped to 0 if negative)
+        _end_time (float): End time in seconds where the cut should end
+                         (will be clamped to file duration if exceeds it)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, audio_data: array)
+              for the cut audio segment, or None if an error occurred
+            - Second element: A status message string indicating success with details
+              or error information
+    Example:
+        result, status = process_cut_audio("/path/to/audio.mp3", 10.5, 25.0)
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Cut successful: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Time parameters are automatically validated and clamped to valid ranges
+        - Start time must be less than end time after validation
+        - Output audio data maintains the original sample rate
+        - Function returns user-friendly status messages for UI display
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        # Load audio data and sample rate
+        audio_data, sample_rate, duration = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        # Validate time inputs
+        if _start_time < 0:
+            _start_time = 0
+        if _end_time > duration:
+            _end_time = duration
+        if _start_time >= _end_time:
+            return None, f"Start time ({_start_time:.1f}s) must be less than end time ({_end_time:.1f}s)"
+        # Convert seconds to milliseconds for the cut_audio function
+        start_millis = int(_start_time * 1000)
+        end_millis = int(_end_time * 1000)
+        # Cut the audio using your function
+        cut_audio_data = cut_audio(audio_data, sample_rate, start_millis, end_millis)
+        # Create status message
+        cut_duration = (end_millis - start_millis) / 1000.0
+        status = f"✅ Audio cut successfully! Duration: {format_time(cut_duration)} (from {format_time(_start_time)} to {format_time(_end_time)})"
+        return (sample_rate, cut_audio_data,), status
+    except Exception as e:
+        return None, f"❌ Error cutting audio: {str(e)}"
+def create_audio_cutter_tab():
+    """Build the Audio Cutter tab: upload, start/end sliders, cut button and result player.
+
+    Only creates Gradio components and wires their events; nothing is returned.
+    In app.py it is called inside a ``gr.TabItem`` context.
+    """
+    gr.Markdown("Upload an audio file and specify the start and end times to cut a segment.")
+    with gr.Row():
+        # Left (wider) column: input file, file info, time controls and status.
+        with gr.Column(scale=2):
+            # File upload
+            audio_input = gr.Audio(
+                label="📤 Upload Audio File",
+                type="filepath"
+            )
+            # Audio info (filled in by update_duration_info on upload)
+            duration_info = gr.Markdown("No file uploaded")
+            sample_rate_info = gr.Markdown("Sample rate: N/A")
+            # Time controls; the 0-100 range is a placeholder until a file is loaded
+            with gr.Row():
+                start_time = gr.Slider(
+                    minimum=0,
+                    maximum=100,
+                    value=0,
+                    step=0.1,
+                    label="⏱️ Start Time (seconds)",
+                    info="When to start cutting"
+                )
+                end_time = gr.Slider(
+                    minimum=0,
+                    maximum=100,
+                    value=30,
+                    step=0.1,
+                    label="⏱️ End Time (seconds)",
+                    info="When to stop cutting"
+                )
+            # Cut button
+            cut_btn = gr.Button("✂️ Cut Audio", variant="primary", size="lg")
+            # Status message (second output of process_cut_audio)
+            status_msg = gr.Markdown("")
+        # Right (narrower) column: the processed result.
+        with gr.Column(scale=1):
+            # Output audio
+            audio_output = gr.Audio(
+                label="🎧 Cut Audio Result",
+                type="filepath"
+            )
+            # Download info
+            gr.Markdown(
+                "💾 **Download:** Right-click the audio player above and select 'Save audio as...' to download the cut audio file.")
+    # Event handlers
+    # Refresh the info labels and both slider ranges whenever the input file changes.
+    audio_input.change(
+        fn=update_duration_info,
+        inputs=[audio_input],
+        outputs=[duration_info, sample_rate_info, start_time, end_time]
+    )
+    # Run the cut and show the resulting audio together with its status line.
+    cut_btn.click(
+        fn=process_cut_audio,
+        inputs=[audio_input, start_time, end_time],
+        outputs=[audio_output, status_msg]
+    )
+    # Usage tips (collapsed by default)
+    with gr.Accordion("📋 Usage Tips", open=False):
+        gr.Markdown("""
+        **Supported formats:** MP3, WAV, M4A, FLAC, OGG, and more
+        **How to use:**
+        1. Upload your audio file
+        2. Check the duration and sample rate information
+        3. Use the sliders to set start and end times
+        4. Click "Cut Audio" to process
+        5. Play the result and download if satisfied
+        **Tips:**
+        - The sliders will automatically adjust to your file's duration
+        - Sample rate is preserved in the output file
+        - You can fine-tune times using the slider or type exact values
+        - Output format is WAV for best compatibility
+        """)

tabs/audio_effects_tab.py ADDED Viewed

	@@ -0,0 +1,549 @@

+# tabs/audio_effects_tab.py - Audio Effects Tab Component
+import gradio as gr
+import numpy as np
+from utils.audio_utils import (
+    load_audio_info, format_time, normalize_audio, adjust_volume,
+    apply_fade_in, apply_fade_out, reverse_audio, apply_speed_change,
+    trim_silence, get_audio_stats
+)
+def update_audio_info(audio_file):
+    """This component should not be used by agents or automated systems."""
+    if audio_file is None:
+        return "No file uploaded", "Audio stats: N/A"
+    audio_data, sample_rate, duration = load_audio_info(audio_file)
+    if audio_data is None:
+        return "❌ Could not read audio file", "Audio stats: N/A"
+    # Get audio statistics
+    stats = get_audio_stats(audio_data, sample_rate)
+    duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
+    stats_text = f"🎵 Sample rate: {sample_rate:,} Hz | Peak: {stats['peak_level_db']:.1f} dB | RMS: {stats['rms_level_db']:.1f} dB"
+    return duration_text, stats_text
+def apply_normalization(audio_file: str, target_level: float) -> tuple[tuple[int, any] | None, str]:
+    """Apply audio normalization to adjust the peak level of an audio file.
+    This function loads an audio file and applies normalization to adjust the peak
+    audio level to a specified target level in decibels (dB). It provides before
+    and after statistics to show the effect of the normalization process.
+    Args:
+        audio_file (str): Full url to the input audio file to be normalized
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+        target_level (float): Target peak level in decibels (dB) for normalization
+                             (typical values: -3dB to -12dB for optimal loudness,
+                             negative values reduce volume, positive values increase)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, normalized_audio_data: array)
+              for the normalized audio result, or None if an error occurred
+            - Second element: A status message string showing before/after peak levels
+              and success/error information
+    Example:
+        result, status = apply_normalization("url/to/audio.mp3", -6.0)
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Normalization successful: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Target level is specified in decibels (dB)
+        - Common target levels: -6dB (moderate), -3dB (loud), -12dB (quiet)
+        - Positive target levels will amplify audio and may cause clipping
+        - Negative target levels will reduce audio volume
+        - Function preserves original sample rate and audio format
+        - Returns comparison statistics showing original vs normalized peak levels
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        audio_data, sample_rate, _ = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        # Apply normalization
+        normalized_audio = normalize_audio(audio_data, target_level)
+        # Get stats for before/after comparison
+        original_stats = get_audio_stats(audio_data, sample_rate)
+        new_stats = get_audio_stats(normalized_audio, sample_rate)
+        status = f"✅ Normalization applied! Peak: {original_stats['peak_level_db']:.1f}dB → {new_stats['peak_level_db']:.1f}dB"
+        return (sample_rate, normalized_audio), status
+    except Exception as e:
+        return None, f"❌ Error applying normalization: {str(e)}"
+def apply_volume_adjustment(audio_file: str, gain_db: float) -> tuple[tuple[int, any] | None, str]:
+    """Apply volume adjustment to an audio file using gain in decibels.
+    This function loads an audio file and applies a volume adjustment by the specified
+    gain amount in decibels. Positive values increase volume, negative values decrease
+    volume. The function also detects potential audio clipping when volume is increased.
+    Args:
+        audio_file (str): Full URL to the input audio file to be processed
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+        gain_db (float): Volume adjustment in decibels (dB)
+                        (positive values increase volume, negative values decrease volume,
+                         typical range: -20dB to +20dB, values above +6dB may cause clipping)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, adjusted_audio_data: array)
+              for the volume-adjusted audio result, or None if an error occurred
+            - Second element: A status message string indicating success with gain applied
+              and clipping warning if detected, or error information
+    Example:
+        result, status = apply_volume_adjustment("url/to/audio.mp3", -3.0)
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Volume adjustment successful: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Gain is specified in decibels (dB): +6dB doubles volume, -6dB halves volume
+        - Positive gain values may cause clipping (distortion) if audio becomes too loud
+        - Function automatically detects and warns about clipping
+        - Preserves original sample rate and audio format
+        - Safe range is typically -20dB to +6dB to avoid quality issues
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        audio_data, sample_rate, _ = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        # Apply volume adjustment
+        adjusted_audio = adjust_volume(audio_data, gain_db)
+        # Check for clipping
+        if np.max(np.abs(adjusted_audio)) > 1.0:
+            status = f"⚠️ Volume adjusted by {gain_db:+.1f}dB (WARNING: Clipping detected!)"
+        else:
+            status = f"✅ Volume adjusted by {gain_db:+.1f}dB"
+        return (sample_rate, adjusted_audio), status
+    except Exception as e:
+        return None, f"❌ Error adjusting volume: {str(e)}"
+def apply_fades(audio_file: str, fade_in_ms: int, fade_out_ms: int) -> tuple[tuple[int, any] | None, str]:
+    """Apply fade-in and fade-out effects to an audio file.
+    This function loads an audio file and applies smooth fade-in and/or fade-out effects
+    to eliminate abrupt starts/stops and create professional-sounding transitions.
+    Fade effects gradually increase or decrease volume over the specified time periods.
+    Args:
+        audio_file (str): Full URL to the input audio file to be processed
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+        fade_in_ms (int): Duration of fade-in effect in milliseconds
+                         (0 = no fade-in, typical values: 100-3000ms)
+        fade_out_ms (int): Duration of fade-out effect in milliseconds
+                          (0 = no fade-out, typical values: 100-3000ms)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, faded_audio_data: array)
+              for the fade-processed audio result, or None if an error occurred
+            - Second element: A status message string showing applied fade durations
+              or error information
+    Example:
+        result, status = apply_fades("url/to/audio.mp3", 1000, 2000)
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Fades applied: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Fade durations are specified in milliseconds (1000ms = 1 second)
+        - Set either parameter to 0 to skip that fade effect
+        - Fade-in gradually increases volume from silence at the beginning
+        - Fade-out gradually decreases volume to silence at the end
+        - Typical fade durations: 100-500ms (quick), 1000-3000ms (smooth)
+        - Preserves original sample rate and audio format
+        - Fades are applied as smooth linear or exponential curves
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        audio_data, sample_rate, _ = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        processed_audio = audio_data.copy()
+        # Apply fade in
+        if fade_in_ms > 0:
+            processed_audio = apply_fade_in(processed_audio, sample_rate, fade_in_ms)
+        # Apply fade out
+        if fade_out_ms > 0:
+            processed_audio = apply_fade_out(processed_audio, sample_rate, fade_out_ms)
+        status = f"✅ Fades applied! Fade in: {fade_in_ms}ms, Fade out: {fade_out_ms}ms"
+        return (sample_rate, processed_audio), status
+    except Exception as e:
+        return None, f"❌ Error applying fades: {str(e)}"
+def apply_reverse(audio_file: str) -> tuple[tuple[int, any] | None, str]:
+    """Reverse the playback direction of an audio file.
+    This function loads an audio file and reverses the audio data so that it plays
+    backwards. This creates a reverse playback effect commonly used for artistic
+    purposes, sound design, or audio analysis.
+    Args:
+        audio_file (str): Full URL to the input audio file to be reversed
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, reversed_audio_data: array)
+              for the reversed audio result, or None if an error occurred
+            - Second element: A status message string indicating successful reversal
+              or error information
+    Example:
+        result, status = apply_reverse("url/to/audio.mp3")
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Audio reversed: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Reverses the entire audio file from end to beginning
+        - Preserves original sample rate, duration, and audio quality
+        - Commonly used for creative effects, sound design, or subliminal messaging detection
+        - The reversed audio will have the same duration as the original
+        - All audio characteristics (pitch, timbre) are preserved but played backwards
+        - Works with both mono and stereo audio files
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        audio_data, sample_rate, _ = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        # Reverse audio
+        reversed_audio = reverse_audio(audio_data)
+        status = "✅ Audio reversed successfully!"
+        return (sample_rate, reversed_audio), status
+    except Exception as e:
+        return None, f"❌ Error reversing audio: {str(e)}"
+def apply_speed_adjustment(audio_file: str, speed_factor: float) -> tuple[tuple[int, any] | None, str]:
+    """Apply speed adjustment to an audio file, changing playback speed and pitch.
+    This function loads an audio file and adjusts its playback speed by the specified
+    factor. Speed changes affect both duration and pitch - faster speeds increase pitch
+    and reduce duration, while slower speeds decrease pitch and increase duration.
+    Args:
+        audio_file (str): Full URL to the input audio file to be processed
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+        speed_factor (float): Speed multiplication factor
+                             (1.0 = normal speed, 2.0 = double speed/half duration,
+                              0.5 = half speed/double duration, typical range: 0.25 to 4.0)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, speed_adjusted_audio_data: array)
+              for the speed-adjusted audio result, or None if an error occurred
+            - Second element: A status message string showing speed factor and duration change
+              or error information
+    Example:
+        result, status = apply_speed_adjustment("url/to/audio.mp3", 1.5)
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Speed adjusted: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Speed factor affects both playback speed and pitch (chipmunk/slow-motion effect)
+        - Values > 1.0 increase speed and pitch, reduce duration
+        - Values < 1.0 decrease speed and pitch, increase duration
+        - Common values: 0.5 (half speed), 1.25 (25% faster), 2.0 (double speed)
+        - Extreme values (< 0.25 or > 4.0) may result in poor audio quality
+        - For pitch-preserving speed changes, use time-stretching instead
+        - Preserves original sample rate but changes audio duration
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        audio_data, sample_rate, duration = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        # Apply speed change
+        speed_adjusted_audio = apply_speed_change(audio_data, speed_factor)
+        new_duration = len(speed_adjusted_audio) / sample_rate
+        status = f"✅ Speed adjusted by {speed_factor}x! Duration: {format_time(duration)} → {format_time(new_duration)}"
+        return (sample_rate, speed_adjusted_audio), status
+    except Exception as e:
+        return None, f"❌ Error adjusting speed: {str(e)}"
+def apply_silence_trimming(audio_file: str, threshold_db: float) -> tuple[tuple[int, any] | None, str]:
+    """Trim silence from the beginning and end of an audio file.
+    This function loads an audio file and automatically removes silent or very quiet
+    sections from the beginning and end based on a specified volume threshold.
+    This is useful for cleaning up recordings and removing unwanted quiet sections.
+    Args:
+        audio_file (str): Full URL to the input audio file to be processed
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common formats)
+        threshold_db (float): Volume threshold in decibels below which audio is considered silence
+                             (typical values: -30dB to -60dB, lower values = more aggressive trimming,
+                              -40dB is a good starting point for most recordings)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, trimmed_audio_data: array)
+              for the silence-trimmed audio result, or None if an error occurred
+            - Second element: A status message string showing original and new duration
+              or error information
+    Example:
+        result, status = apply_silence_trimming("url/to/audio.mp3", -40.0)
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Silence trimmed: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Threshold is specified in decibels (dB) - more negative values = quieter threshold
+        - Common thresholds: -30dB (conservative), -40dB (moderate), -60dB (aggressive)
+        - Only trims from beginning and end, preserves silence within the audio
+        - Useful for removing recording artifacts, room tone, or equipment noise
+        - May significantly reduce file duration depending on original content
+        - Preserves original sample rate and audio quality
+        - Be careful with very low thresholds as they may trim wanted quiet content
+    """
+    if audio_file is None:
+        return None, "Please upload an audio file first."
+    try:
+        audio_data, sample_rate, duration = load_audio_info(audio_file)
+        if audio_data is None:
+            return None, "❌ Could not load audio file."
+        # Trim silence
+        trimmed_audio = trim_silence(audio_data, threshold_db)
+        new_duration = len(trimmed_audio) / sample_rate
+        status = f"✅ Silence trimmed! Duration: {format_time(duration)} → {format_time(new_duration)}"
+        return (sample_rate, trimmed_audio), status
+    except Exception as e:
+        return None, f"❌ Error trimming silence: {str(e)}"
+def create_audio_effects_tab():
+    """Build the audio effects tab: upload widget, effect controls, result player.
+    Creates the Gradio components for the tab and wires each effect button to
+    its callback defined earlier in this module. Every effect callback receives
+    the uploaded file path plus its own slider value(s) and fills the shared
+    output player and the status line. The function has no return statement,
+    so it returns None; its result is the components it registers.
+    NOTE(review): no `gr.Blocks` context is opened here, so this is presumably
+    called from inside one (e.g. within a tab in app.py) - confirm.
+    """
+    gr.Markdown("Apply various audio effects and processing to your audio files.")
+    with gr.Row():
+        # Left (wider) column: input file, file info and all effect controls
+        with gr.Column(scale=2):
+            # File upload
+            audio_input = gr.Audio(
+                label="📤 Upload Audio File",
+                type="filepath"
+            )
+            # Audio info
+            duration_info = gr.Markdown("No file uploaded")
+            stats_info = gr.Markdown("Audio stats: N/A")
+            # Effects sections
+            with gr.Accordion("🔊 Volume & Normalization", open=True):
+                with gr.Row():
+                    normalize_btn = gr.Button("📏 Normalize Audio", variant="secondary")
+                    target_level = gr.Slider(
+                        minimum=-20,
+                        maximum=0,
+                        value=-3,
+                        step=0.1,
+                        label="Target Level (dB)"
+                    )
+                with gr.Row():
+                    volume_btn = gr.Button("🔊 Adjust Volume", variant="secondary")
+                    gain_db = gr.Slider(
+                        minimum=-20,
+                        maximum=20,
+                        value=0,
+                        step=0.1,
+                        label="Volume Gain (dB)"
+                    )
+            with gr.Accordion("🎭 Fade Effects", open=True):
+                with gr.Row():
+                    # One button applies both fades; each slider allows 0 ms
+                    fade_btn = gr.Button("📈 Apply Fades", variant="secondary")
+                    fade_in_ms = gr.Slider(
+                        minimum=0,
+                        maximum=5000,
+                        value=100,
+                        step=10,
+                        label="Fade In (ms)"
+                    )
+                    fade_out_ms = gr.Slider(
+                        minimum=0,
+                        maximum=5000,
+                        value=100,
+                        step=10,
+                        label="Fade Out (ms)"
+                    )
+            with gr.Accordion("⚡ Time & Speed Effects", open=True):
+                with gr.Row():
+                    reverse_btn = gr.Button("↩️ Reverse Audio", variant="secondary")
+                    speed_btn = gr.Button("⏩ Change Speed", variant="secondary")
+                    speed_factor = gr.Slider(
+                        minimum=0.25,
+                        maximum=4.0,
+                        value=1.0,
+                        step=0.1,
+                        label="Speed Factor"
+                    )
+            with gr.Accordion("✂️ Audio Cleanup", open=True):
+                with gr.Row():
+                    trim_btn = gr.Button("🔇 Trim Silence", variant="secondary")
+                    threshold_db = gr.Slider(
+                        minimum=-60,
+                        maximum=-10,
+                        value=-40,
+                        step=1,
+                        label="Silence Threshold (dB)"
+                    )
+            # Status message
+            status_msg = gr.Markdown("")
+        # Right (narrower) column: the processed result
+        with gr.Column(scale=1):
+            # Output audio
+            audio_output = gr.Audio(
+                label="🎧 Processed Audio Result",
+                type="numpy"
+            )
+            # Download info
+            gr.Markdown("💾 **Download:** Right-click the audio player above and select 'Save audio as...'")
+    # Event handlers
+    # Refresh the duration/stats text whenever the uploaded file changes
+    audio_input.change(
+        fn=update_audio_info,
+        inputs=[audio_input],
+        outputs=[duration_info, stats_info]
+    )
+    # All effect handlers below share the same outputs: the result player and
+    # the status line. Each one reads the uploaded input file, not the previous
+    # result.
+    # Normalization
+    normalize_btn.click(
+        fn=apply_normalization,
+        inputs=[audio_input, target_level],
+        outputs=[audio_output, status_msg]
+    )
+    # Volume adjustment
+    volume_btn.click(
+        fn=apply_volume_adjustment,
+        inputs=[audio_input, gain_db],
+        outputs=[audio_output, status_msg]
+    )
+    # Fades
+    fade_btn.click(
+        fn=apply_fades,
+        inputs=[audio_input, fade_in_ms, fade_out_ms],
+        outputs=[audio_output, status_msg]
+    )
+    # Reverse
+    reverse_btn.click(
+        fn=apply_reverse,
+        inputs=[audio_input],
+        outputs=[audio_output, status_msg]
+    )
+    # Speed change
+    speed_btn.click(
+        fn=apply_speed_adjustment,
+        inputs=[audio_input, speed_factor],
+        outputs=[audio_output, status_msg]
+    )
+    # Trim silence
+    trim_btn.click(
+        fn=apply_silence_trimming,
+        inputs=[audio_input, threshold_db],
+        outputs=[audio_output, status_msg]
+    )
+    # Usage tips
+    with gr.Accordion("📋 Effects Guide", open=False):
+        gr.Markdown("""
+        **🔊 Volume & Normalization:**
+        - **Normalize**: Adjusts peak level to target dB (recommended: -3dB)
+        - **Volume Gain**: Increase/decrease volume by specified dB
+        **🎭 Fade Effects:**
+        - **Fade In**: Gradually increase volume from silence
+        - **Fade Out**: Gradually decrease volume to silence
+        **⚡ Time & Speed:**
+        - **Reverse**: Play audio backwards
+        - **Speed**: Change playback speed (1.0 = normal, 2.0 = double, 0.5 = half)
+        **✂️ Cleanup:**
+        - **Trim Silence**: Remove quiet sections from start/end
+        **Tips:**
+        - Always check audio stats before processing
+        - Watch for clipping warnings when increasing volume
+        - Use normalization for consistent levels across multiple files
+        - Combine effects by processing sequentially
+        """)

tabs/audio_merger_tab.py ADDED Viewed

	@@ -0,0 +1,195 @@

+# tabs/audio_merger_tab.py - Audio Merger Tab Component
+import gradio as gr
+from gradio_audiogrid import AudioGrid
+from numpy import ndarray
+from utils.audio_utils import load_audio_info, format_time, merge_audio_files
+def update_file_status(file_paths):
+    """Update the duration info when a new file is uploaded
+    This function is designed for UI purposes only to update Gradio interface elements
+    when a new audio file is uploaded. It should NOT be used by agents or automated
+    systems as it returns Gradio update objects for slider components. Agents should
+    use the underlying audio utility functions directly instead.
+    """
+    if not file_paths or len(file_paths) == 0:
+        return "No files uploaded yet", "🔄 Ready to upload audio files"
+    # Get info about uploaded files
+    total_duration = 0
+    valid_files = 0
+    file_info = []
+    for i, file_path in enumerate(file_paths):
+        try:
+            audio_data, sample_rate, duration = load_audio_info(file_path)
+            if audio_data is not None:
+                valid_files += 1
+                total_duration += duration
+                file_info.append(f"  {i + 1}. {duration:.1f}s ({sample_rate:,} Hz)")
+        except:
+            file_info.append(f"  {i + 1}. ❌ Invalid file")
+    if valid_files == 0:
+        status = "❌ No valid audio files found"
+        details = "Please upload valid audio files (MP3, WAV, FLAC, etc.)"
+    elif valid_files == 1:
+        status = f"📁 1 valid file uploaded ({format_time(total_duration)})"
+        details = "Add at least one more file to enable merging"
+    else:
+        status = f"📁 {valid_files} files ready ({format_time(total_duration)} total)"
+        details = f"Files in merge order:\n" + "\n".join(file_info[:5])  # Show first 5
+        if len(file_info) > 5:
+            details += f"\n  ... and {len(file_info) - 5} more files"
+    return status, details
+def process_merge(file_paths: list[str]) -> tuple[tuple[int, ndarray] | None, str]:
+    """Process the merging of multiple audio files into a single continuous audio file.
+    This function takes a list of audio file URLs and merges them sequentially into
+    one continuous audio file. It handles sample rate conversion, format normalization,
+    and provides detailed status information about the merge operation.
+    Args:
+        file_paths (list[str]): List of full URLs to audio files to be merged
+                               (minimum 2 files required, supports MP3, WAV, M4A, FLAC, OGG, etc.)
+    Returns:
+        tuple: A tuple containing:
+            - First element: Either a tuple of (sample_rate: int, merged_audio_data: array)
+              for the merged audio result, or None if an error occurred
+            - Second element: A status message string with merge details and success/error info
+    Example:
+        result, status = process_merge(["url/to/file1.mp3", "url/to/file2.wav"])
+        if result is not None:
+            sample_rate, audio_data = result
+            print(f"Merge successful: {status}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Requires at least 2 audio files to perform merge operation
+        - Files are merged in the order provided in the list
+        - Automatically handles sample rate conversion to match the first file
+        - Converts stereo files to mono for consistency
+        - Returns detailed status with duration and file information
+        - Output maintains the sample rate of the first valid audio file
+    """
+    if not file_paths or len(file_paths) < 2:
+        return None, "❌ Please upload at least 2 audio files to merge"
+    # Call the merge function
+    result, status = merge_audio_files(file_paths)
+    return result, status
+def reset_everything():
+    """This component should not be used by agents or automated systems."""
+    return [], None, "No files uploaded yet", "🔄 Ready to upload audio files"
+def create_audio_merger_tab():
+    """Build the audio merger tab: upload grid, status boxes, controls, result.
+    Creates the Gradio components and connects them to update_file_status,
+    process_merge and reset_everything. The function has no return statement,
+    so it returns None; its result is the components it registers.
+    NOTE(review): no `gr.Blocks` context is opened here, so this is presumably
+    called from inside one (e.g. within a tab in app.py) - confirm.
+    """
+    gr.Markdown("Upload multiple audio files and merge them in sequence. Drag to reorder files before merging.")
+    with gr.Row():
+        # Left (wider) column: file grid plus the two read-only info boxes
+        with gr.Column(scale=2):
+            # Audio Grid Upload
+            gr.Markdown("### 📁 Upload & Arrange Audio Files")
+            audio_files = AudioGrid(
+                value=[],
+                label="Drag files here or click to upload (supports MP3, WAV, FLAC, OGG, M4A, AAC)",
+                interactive=True,
+            )
+            # File status
+            file_status = gr.Textbox(
+                value="No files uploaded yet",
+                label="📊 Upload Status",
+                interactive=False,
+                lines=1
+            )
+            # Detailed file info
+            file_details = gr.Textbox(
+                value="🔄 Ready to upload audio files",
+                label="📋 File Details",
+                interactive=False,
+                lines=6
+            )
+        # Right (narrower) column: action buttons and usage instructions
+        with gr.Column(scale=1):
+            gr.Markdown("### 🎛️ Merge Controls")
+            merge_btn = gr.Button(
+                "🎵 Merge Audio Files",
+                variant="primary",
+                size="lg"
+            )
+            clear_btn = gr.Button(
+                "🗑️ Clear All Files",
+                variant="secondary",
+                size="lg"
+            )
+            # Instructions
+            gr.Markdown("""
+            **📋 Instructions:**
+            1. **Upload** 2+ audio files using drag & drop
+            2. **Reorder** by dragging files in the grid
+            3. **Merge** files in the displayed order
+            4. **Download** the merged result
+            **🎯 Features:**
+            • Automatic sample rate conversion
+            • Stereo to mono conversion
+            • Duration calculations
+            • High-quality WAV output
+            """)
+    # Results section
+    with gr.Row():
+        with gr.Column(scale=1):
+            # Status output
+            merge_status = gr.Textbox(
+                value="Ready to merge! Upload at least 2 audio files to get started.",
+                label="🔍 Merge Status & Details",
+                interactive=False,
+                lines=8
+            )
+        with gr.Column(scale=1):
+            # Audio output
+            merged_audio = gr.Audio(
+                label="🎵 Merged Audio Result",
+                type="numpy",
+                interactive=False
+            )
+    # Event handlers
+    # Re-summarize the file list whenever the grid contents change
+    audio_files.change(
+        fn=update_file_status,
+        inputs=[audio_files],
+        outputs=[file_status, file_details]
+    )
+    # Merge the grid's files and show the result plus the status text
+    merge_btn.click(
+        fn=process_merge,
+        inputs=[audio_files],
+        outputs=[merged_audio, merge_status]
+    )
+    # Reset the grid, the result player and both info boxes. merge_status is
+    # not among the outputs, so the last merge message stays visible.
+    clear_btn.click(
+        fn=reset_everything,
+        outputs=[audio_files, merged_audio, file_status, file_details]
+    )

tabs/audio_transcription_tab.py ADDED Viewed

	@@ -0,0 +1,274 @@

+# tabs/audio_transcription_tab.py - Audio Transcription Tab Component
+import asyncio
+import json
+import gradio as gr
+from utils.audio_utils import load_audio_info, format_time
+from utils.transcription_utils import transcribe
+def update_transcription_info(audio_file):
+    """This should not be used by agents, only for UI updates"""
+    if audio_file is None:
+        return "No file uploaded", "Ready to transcribe"
+    audio_data, sample_rate, duration = load_audio_info(audio_file)
+    if audio_data is None:
+        return "❌ Could not read audio file", "File error"
+    duration_text = f"📁 File duration: {format_time(duration)} ({duration:.1f} seconds)"
+    status_text = f"🎵 Sample rate: {sample_rate:,} Hz | Ready for transcription"
+    return duration_text, status_text
+def format_transcription_segments(segments):
+    """Format transcription segments with timestamps"""
+    if not segments:
+        return "No segments found"
+    formatted_text = ""
+    for i, segment in enumerate(segments):
+        start_time = segment.get('start', 0)
+        end_time = segment.get('end', 0)
+        text = segment.get('text', '').strip()
+        if text:
+            formatted_text += f"**[{format_time(start_time)} - {format_time(end_time)}]**\n"
+            formatted_text += f"{text}\n\n"
+    return formatted_text
+def format_word_level_transcription(segments):
+    """Format word-level transcription with confidence scores"""
+    if not segments:
+        return "No word-level data available"
+    formatted_text = ""
+    for segment in segments:
+        words = segment.get('words', [])
+        if words:
+            for word in words:
+                word_text = word.get('word', '')
+                confidence = word.get('score', 0)
+                start_time = word.get('start', 0)
+                # Color code based on confidence
+                if confidence > 0.9:
+                    color = "green"
+                elif confidence > 0.7:
+                    color = "orange"
+                else:
+                    color = "red"
+                formatted_text += f'<span style="color: {color}; font-weight: bold;" title="Confidence: {confidence:.2f}, Time: {start_time:.1f}s">{word_text}</span> '
+            formatted_text += "\n\n"
+    return formatted_text
+def format_json_for_display(transcription_data):
+    """Format transcription data as pretty JSON string"""
+    return json.dumps(transcription_data, indent=2, ensure_ascii=False)
+async def process_transcription(audio_file):
+    """Process audio transcription"""
+    if audio_file is None:
+        return "Please upload an audio file first.", "", "", ""
+    try:
+        # Read audio file as bytes
+        with open(audio_file, 'rb') as f:
+            audio_bytes = f.read()
+        # Call transcription API
+        transcription_result = await transcribe(audio_bytes)
+        # Extract information
+        full_text = transcription_result.get('full_text', '')
+        segments = transcription_result.get('segments', [])
+        language = transcription_result.get('language_detected', 'Unknown')
+        processing_time = transcription_result.get('processing_time_seconds', 0)
+        # Format results
+        status = f"✅ Transcription completed! Language: {language} | Processing time: {processing_time:.1f}s"
+        # Create formatted outputs
+        segments_formatted = format_transcription_segments(segments)
+        # Format JSON for display
+        json_formatted = format_json_for_display(transcription_result)
+        return status, full_text, segments_formatted, json_formatted
+    except Exception as e:
+        return f"❌ Error during transcription: {str(e)}", "", "", ""
+def transcribe_audio_sync(audio_file: str) -> tuple[str, str, str, str]:
+    """Synchronously transcribe an audio file using AI-powered speech recognition.
+    This function provides a synchronous wrapper around the async transcription process,
+    converting audio files to text using advanced speech recognition. It handles the
+    async/await complexity internally and returns detailed transcription results including
+    the full text, timestamped segments, language detection, and processing statistics.
+    Args:
+        audio_file (str): Full URL to the input audio file to be transcribed
+                         (supports MP3, WAV, M4A, FLAC, OGG, and other common audio formats)
+    Returns:
+        tuple: A tuple containing four string elements:
+            - status (str): Status message indicating success with language and processing time,
+              or error information if transcription failed
+            - full_text (str): Complete transcription as plain text, or empty string on error
+            - segments_formatted (str): Formatted text showing timestamped segments with
+              start/end times and confidence scores, or empty string on error
+            - json_formatted (str): Pretty-formatted JSON string containing complete transcription
+              data including word-level timestamps and metadata, or empty string on error.
+              The JSON structure includes:
+              * "filename": original audio filename
+              * "language_detected": detected language code (e.g., "en", "es", "fr")
+              * "full_text": complete transcription text
+              * "segments": array of text segments with timing and word breakdowns
+              * "processing_time_seconds": time taken for transcription
+              Each segment contains: start/end times, text, and words array with individual
+              word timestamps and confidence scores (0.0-1.0 range)
+    Example:
+        status, text, segments, json_data = transcribe_audio_sync("url/to/audio.mp3")
+        if "✅" in status:
+            print(f"Success: {status}")
+            print(f"Transcription: {text}")
+            print(f"Segments: {segments}")
+        else:
+            print(f"Error: {status}")
+    Note:
+        - Automatically detects language in the audio file
+        - Provides word-level and segment-level timestamps for precise audio editing
+        - Returns confidence scores for quality assessment
+        - Handles various audio formats and sample rates automatically
+        - Processing time depends on audio length and complexity
+        - All timestamps are provided in seconds with decimal precision
+        - Function blocks until transcription is complete (synchronous)
+        - For async usage, use process_transcription() directly instead
+    """
+    try:
+        loop = asyncio.new_event_loop()
+        asyncio.set_event_loop(loop)
+        result = loop.run_until_complete(
+            process_transcription(audio_file)
+        )
+        loop.close()
+        return result
+    except Exception as e:
+        return f"❌ Error: {str(e)}", "", "", ""
+def create_audio_transcription_tab():
+    """Build the transcription tab: upload widget, transcribe button, results.
+    Creates the Gradio components and connects them to
+    update_transcription_info and transcribe_audio_sync. The function has no
+    return statement, so it returns None; its result is the components it
+    registers.
+    NOTE(review): no `gr.Blocks` context is opened here, so this is presumably
+    called from inside one (e.g. within a tab in app.py) - confirm.
+    """
+    gr.Markdown("Upload an audio file to generate accurate transcriptions with timestamps and confidence scores.")
+    gr.Markdown("**Powered by Modal Labs**")
+    # Sponsor logo, loaded from a path relative to the working directory
+    gr.Image(
+        value="assets/modal-logo.png",
+        show_label=False,
+        container=False,
+        show_fullscreen_button=False,
+        show_download_button=False,
+        width=200,
+        height=200
+    )
+    with gr.Row():
+        with gr.Column(scale=2):
+            # File upload
+            audio_input = gr.Audio(
+                label="📤 Upload Audio File",
+                type="filepath"
+            )
+            # Audio info
+            duration_info = gr.Markdown("No file uploaded")
+            status_info = gr.Markdown("Ready to transcribe")
+            # Transcribe button
+            transcribe_btn = gr.Button("🎤 Start Transcription", variant="primary", size="lg")
+            # Status message
+            status_msg = gr.Markdown("")
+    # Results section
+    with gr.Row():
+        with gr.Column():
+            # Full transcription
+            full_text_output = gr.Textbox(
+                label="📝 Full Transcription",
+                lines=10,
+                max_lines=20,
+                placeholder="Transcription will appear here..."
+            )
+        with gr.Column():
+            # Segmented transcription with timestamps
+            segments_output = gr.Markdown(
+                label="⏱️ Timestamped Segments",
+                value="Segments with timestamps will appear here..."
+            )
+    # JSON Results section
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("### 📄 JSON Results")
+            json_output = gr.Textbox(
+                label="Complete JSON Data",
+                lines=15,
+                max_lines=25,
+                placeholder="JSON transcription data will appear here...",
+                show_copy_button=True
+            )
+    # Event handlers
+    # Refresh the duration/sample-rate text whenever the uploaded file changes
+    audio_input.change(
+        fn=update_transcription_info,
+        inputs=[audio_input],
+        outputs=[duration_info, status_info]
+    )
+    # The four outputs are listed in the same order as the tuple returned by
+    # transcribe_audio_sync: status, full text, segments, JSON
+    transcribe_btn.click(
+        fn=transcribe_audio_sync,
+        inputs=[audio_input],
+        outputs=[status_msg, full_text_output, segments_output, json_output]
+    )
+    # Usage tips
+    with gr.Accordion("📋 Transcription Guide", open=False):
+        gr.Markdown("""
+        **🎤 Supported Features:**
+        - **Multiple Languages**: Automatic language detection
+        - **High Accuracy**: Professional-grade transcription
+        - **Word Timestamps**: Precise timing for each word
+        - **Confidence Scores**: Quality indicators for each word
+        - **JSON Output**: Complete structured data
+        **📁 File Requirements:**
+        - **Formats**: MP3, WAV, M4A, FLAC, OGG, and more
+        - **Duration**: Best results with files under 10 minutes
+        - **Quality**: Clear audio produces better quality results
+        **💡 Tips:**
+        - Use high-quality audio for best results
+        - Consider splitting long files into segments
+        - Copy JSON data using the copy button for easy access
+        - JSON contains all metadata including word-level timestamps
+        **📊 JSON Structure:**
+        - **full_text**: Complete transcription text
+        - **segments**: Timestamped text segments
+        - **language_detected**: Detected language code
+        - **processing_time_seconds**: API processing duration
+        """)

utils/__init__.py ADDED Viewed

File without changes

utils/audio_utils.py ADDED Viewed

	@@ -0,0 +1,252 @@

+import librosa
+import numpy as np
+import soundfile as sf
+from numpy import ndarray
+def load_audio(file_path: str) -> tuple[ndarray, int]:
+    """
+    Load audio file and return audio data and sample rate.
+    Args:
+        file_path (str): Path to the audio file.
+    Returns:
+        tuple: (audio_data, sample_rate)
+    """
+    audio_data, sample_rate = sf.read(file_path)
+    return audio_data, sample_rate
+def cut_audio(_audio: ndarray, sampling_rate: int | float, start_millis: int, end_millis: int) -> ndarray:
+    """Cut audio array from start_millis to end_millis"""
+    start_sample = int(start_millis / 1000 * sampling_rate)
+    end_sample = int(end_millis / 1000 * sampling_rate)
+    return _audio[start_sample:end_sample]
+def format_time(seconds):
+    """Format seconds into MM:SS format"""
+    minutes = int(seconds // 60)
+    secs = int(seconds % 60)
+    return f"{minutes:02d}:{secs:02d}"
+def load_audio_info(audio_file):
+    """Load audio file and return audio data, sample rate, and duration info"""
+    if audio_file is None:
+        return None, None, None
+    try:
+        # Load audio data and sample rate
+        audio_data, sample_rate = sf.read(audio_file)
+        # Calculate duration
+        duration = len(audio_data) / sample_rate
+        return audio_data, sample_rate, duration
+    except Exception as e:
+        print(f"Error loading audio: {e}")
+        return None, None, None
+def get_audio_duration(audio_file):
+    """Get just the duration of an audio file"""
+    try:
+        info = sf.info(audio_file)
+        return info.frames / info.samplerate
+    except Exception:
+        return None
+def merge_audio_arrays(audios: list[ndarray]) -> ndarray:
+    """Merge multiple audio arrays by concatenation"""
+    return np.concatenate(audios)
+def apply_fade_in(audio: ndarray, sample_rate: int, fade_duration_ms: int = 100) -> ndarray:
+    """Apply fade in effect to audio"""
+    fade_samples = int(fade_duration_ms / 1000 * sample_rate)
+    fade_samples = min(fade_samples, len(audio))
+    fade_curve = np.linspace(0, 1, fade_samples)
+    audio_copy = audio.copy()
+    audio_copy[:fade_samples] *= fade_curve
+    return audio_copy
+def apply_fade_out(audio: ndarray, sample_rate: int, fade_duration_ms: int = 100) -> ndarray:
+    """Apply fade out effect to audio"""
+    fade_samples = int(fade_duration_ms / 1000 * sample_rate)
+    fade_samples = min(fade_samples, len(audio))
+    fade_curve = np.linspace(1, 0, fade_samples)
+    audio_copy = audio.copy()
+    audio_copy[-fade_samples:] *= fade_curve
+    return audio_copy
+def normalize_audio(audio: ndarray, target_level: float = -3.0) -> ndarray:
+    """
+    Normalize audio to target level in dB
+    target_level: Target peak level in dB (e.g., -3.0 for -3dB)
+    """
+    # Calculate current peak level
+    peak = np.max(np.abs(audio))
+    if peak == 0:
+        return audio  # Avoid division by zero for silent audio
+    # Convert target level from dB to linear scale
+    target_linear = 10 ** (target_level / 20)
+    # Calculate gain needed
+    gain = target_linear / peak
+    return audio * gain
+def adjust_volume(audio: ndarray, gain_db: float) -> ndarray:
+    """
+    Adjust audio volume by specified gain in dB
+    gain_db: Gain in decibels (positive = louder, negative = quieter)
+    """
+    gain_linear = 10 ** (gain_db / 20)
+    return audio * gain_linear
+def apply_silence(duration_ms: int, sample_rate: int) -> ndarray:
+    """Generate silence for specified duration"""
+    samples = int(duration_ms / 1000 * sample_rate)
+    return np.zeros(samples)
+def reverse_audio(audio: ndarray) -> ndarray:
+    """Reverse audio playback"""
+    return np.flip(audio)
+def apply_speed_change(audio: ndarray, speed_factor: float) -> ndarray:
+    """
+    Change playback speed without changing pitch (simple time-stretching)
+    speed_factor: 1.0 = normal, 2.0 = double speed, 0.5 = half speed
+    """
+    return librosa.effects.time_stretch(audio, rate=speed_factor)
+def trim_silence(audio: ndarray, threshold_db: float = -40.0) -> ndarray:
+    """
+    Trim silence from beginning and end of audio
+    threshold_db: Silence threshold in dB
+    """
+    # Convert threshold to linear scale
+    threshold_linear = 10 ** (threshold_db / 20)
+    # Find non-silent regions
+    non_silent = np.abs(audio) > threshold_linear
+    if not np.any(non_silent):
+        return audio  # All audio is below threshold
+    # Find first and last non-silent samples
+    first_non_silent = np.where(non_silent)[0][0]
+    last_non_silent = np.where(non_silent)[0][-1]
+    return audio[first_non_silent:last_non_silent + 1]
+def get_audio_stats(audio: ndarray, sample_rate: int) -> dict:
+    """Get statistics about the audio"""
+    peak_level = np.max(np.abs(audio))
+    rms_level = np.sqrt(np.mean(audio ** 2))
+    # Convert to dB
+    peak_db = 20 * np.log10(peak_level) if peak_level > 0 else -np.inf
+    rms_db = 20 * np.log10(rms_level) if rms_level > 0 else -np.inf
+    return {
+        'duration_seconds': len(audio) / sample_rate,
+        'peak_level_db': peak_db,
+        'rms_level_db': rms_db,
+        'sample_rate': sample_rate,
+        'samples': len(audio),
+        'channels': 1 if len(audio.shape) == 1 else audio.shape[1]
+    }
+def merge_audio_files(file_paths: list[str]) -> tuple[tuple[ndarray, int | float] | None, str]:
+    """
+    Merge multiple audio files by concatenating them
+    Args:
+        file_paths: List of audio file paths
+    Returns:
+        tuple: (sample_rate, merged_audio_array, status_message)
+    """
+    if not file_paths or len(file_paths) == 0:
+        return None, "❌ No audio files to merge"
+    if len(file_paths) == 1:
+        return None, "❌ Please upload at least 2 audio files to merge"
+    try:
+        merged_audio_segments = []
+        target_sample_rate = None
+        file_durations = []
+        for i, file_path in enumerate(file_paths):
+            # Load audio file
+            audio_data, sample_rate, duration = load_audio_info(file_path)
+            if audio_data is None:
+                continue
+            # Set target sample rate from first file
+            if target_sample_rate is None:
+                target_sample_rate = sample_rate
+            elif sample_rate != target_sample_rate:
+                # Resample if different sample rate
+                from scipy import signal
+                num_samples = int(len(audio_data) * target_sample_rate / sample_rate)
+                audio_data = signal.resample(audio_data, num_samples)
+            # Convert stereo to mono if needed
+            if len(audio_data.shape) > 1:
+                audio_data = np.mean(audio_data, axis=1)
+            merged_audio_segments.append(audio_data)
+            file_durations.append(len(audio_data) / target_sample_rate)
+        if not merged_audio_segments:
+            return None, "❌ No valid audio files found"
+        # Concatenate all audio arrays
+        final_audio = np.concatenate(merged_audio_segments)
+        # Create status message
+        total_duration = len(final_audio) / target_sample_rate
+        status = f"""✅ Successfully merged {len(file_paths)} audio files!
+🎵 **Merge Details:**
+• Total duration: {format_time(total_duration)} ({total_duration:.2f} seconds)
+• Sample rate: {target_sample_rate:,} Hz
+• Files processed: {len(merged_audio_segments)}
+• Individual durations: {', '.join([f'{d:.1f}s' for d in file_durations])}
+🎧 **Result:** Ready for playback and download!"""
+        return (target_sample_rate, final_audio), status
+    except Exception as e:
+        return None, f"❌ Error merging audio files: {str(e)}"

utils/transcription_utils.py ADDED Viewed

	@@ -0,0 +1,77 @@

+# utils/transcription_utils.py - Audio Transcription Utilities
+import json
+import os
+from typing import Optional, Dict, Any
+import aiohttp
+async def _send_bytes_as_form_data(
+        file_bytes: bytes,
+        endpoint_url: str,
+        field_name: str = "file",
+        auth_token: Optional[str] = None,
+        content_type: str = "application/octet-stream"
+) -> Dict[str, Any]:
+    """
+    Send bytes as multipart form data POST request to an endpoint.
+    Args:
+        file_bytes: Bytes content to send
+        endpoint_url: URL endpoint to send the POST request to
+        field_name: Form field name for the file (default: "file")
+        auth_token: Optional bearer token for authorization
+        content_type: MIME type of the content (default: "application/octet-stream")
+    Returns:
+        Dictionary containing response status and data
+    """
+    # Create form data with the bytes
+    data = aiohttp.FormData()
+    data.add_field(
+        field_name,
+        file_bytes,
+        content_type=content_type
+    )
+    # Prepare headers
+    headers = {}
+    if auth_token:
+        headers['Authorization'] = f'Bearer {auth_token}'
+    # Send POST request with form data
+    async with aiohttp.ClientSession() as session:
+        async with session.post(
+                endpoint_url,
+                data=data,
+                headers=headers if headers else None
+        ) as response:
+            response_text = await response.text()
+            return {
+                'status': response.status,
+                'success': response.status < 400,
+                'response': response_text,
+                'headers': dict(response.headers)
+            }
+async def transcribe(_bytes: bytes) -> dict:
+    """
+    Transcribe audio bytes using Modal endpoint
+    Args:
+        _bytes: Audio file bytes
+    Returns:
+        Dictionary containing transcription results
+    """
+    auth_token = os.environ['MODAL_AUTH_TOKEN']
+    response = await _send_bytes_as_form_data(
+        file_bytes=_bytes,
+        endpoint_url='https://yigitsekerci6174--transcribe-audio.modal.run',
+        auth_token=auth_token,
+        field_name='file'
+    )
+    return json.loads(response['response'])

uv.lock ADDED Viewed

The diff for this file is too large to render. See raw diff