Spaces:

ziqiangao
/

surroundify

Running

App Files Files Community

ziqiangao committed 9 days ago

Commit

938bd0e

1 Parent(s): 05d36ea

Add Smart Mode

Browse files

Files changed (1) hide show

app.py +241 -15

app.py CHANGED Viewed

@@ -66,7 +66,7 @@ def extract_phantom_center(input_file, rdf=0.99999):
     data, fs = sf.read(wav, dtype='float32')
     os.unlink(wav)
     if data.ndim != 2 or data.shape[1] != 2:
-        raise ValueError("Input must be stereo 2-channel")
     L, R = data[:,0], data[:,1]
     M = (L + R) / 2
     nperseg = fs
@@ -84,6 +84,7 @@ def extract_phantom_center(input_file, rdf=0.99999):
 def create_5_1_surround(input_file, preset="music"):
     # Preset-based parameters
     # Reverberance (50%) HF-damping (50%) room-scale (100%) stereo-depth (100%) pre-delay (0ms) wet-gain (0dB)
     if preset == "music":
@@ -99,39 +100,40 @@ def create_5_1_surround(input_file, preset="music"):
         lfe_cutoff = 120
         reverb_args = ['20', '50', '100', '100', '100', '0']
     else:
-        raise ValueError(f"Unknown preset: {preset}")
-    # 1. Extract FL/FR/phantom centre
     fs, FL, FR, FC = extract_phantom_center(input_file)
-    # 2. Get stereo original for reverb
     wav = convert_to_wav_float(input_file)
     stereo, _ = sf.read(wav, dtype='float32')
     os.unlink(wav)
     L_orig, R_orig = stereo[:, 0], stereo[:, 1]
-    # 3. Wet-only reverb with chosen settings
     SL = apply_reverb_wet_only(L_orig, fs, reverb_args)
     SR = apply_reverb_wet_only(R_orig, fs, reverb_args)
-    # 4. Highpass filter everything except LFE
     FL_hp = sox_filter(FL, fs, 'highpass', hp_cutoff)
     FR_hp = sox_filter(FR, fs, 'highpass', hp_cutoff)
     FC_hp = sox_filter(FC, fs, 'highpass', hp_cutoff)
     SL_hp = sox_filter(SL, fs, 'highpass', hp_cutoff)
     SR_hp = sox_filter(SR, fs, 'highpass', hp_cutoff)
-    # 5. Lowpass for LFE
     bass_sum = .5 * (L_orig + R_orig)
     LFE = sox_filter(bass_sum, fs, 'lowpass', lfe_cutoff)
-    # 6. Stack and pad
     channels = [FL_hp, FR_hp, FC_hp, LFE, SL_hp, SR_hp]
     length = max(len(ch) for ch in channels)
     def pad(x): return np.pad(x, (0, length - len(x)))
     multich = np.column_stack([pad(ch) for ch in channels])
-    # 7. Write WAV and encode to OGG
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
     sf.write(out_wav.name, multich, fs, subtype='FLOAT')
     out_wav.close()
@@ -144,6 +146,204 @@ def create_5_1_surround(input_file, preset="music"):
     os.unlink(out_wav.name)
     return out_ogg.name
 # ========== Gradio UI ==========
 with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
@@ -151,15 +351,41 @@ with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
     gr.Markdown("Choose music or speech preset for surround processing")
     inp = gr.Audio(label="Upload stereo audio", type="filepath")
     preset = gr.Dropdown(
-    label="Select Preset",
-    choices=["music", "speech", "open"],
-    value="music"  # or whichever you want as the default
-)
     btn = gr.Button("Convert to 5.1 OGG")
     out = gr.File(label="Download 5.1 OGG")
-    btn.click(fn=create_5_1_surround, inputs=[inp, preset], outputs=[out])
 if __name__ == "__main__":
-    demo.launch()

     data, fs = sf.read(wav, dtype='float32')
     os.unlink(wav)
     if data.ndim != 2 or data.shape[1] != 2:
+        raise gr.Error("Input must be stereo 2-channel")
     L, R = data[:,0], data[:,1]
     M = (L + R) / 2
     nperseg = fs
 def create_5_1_surround(input_file, preset="music"):
+    p = gr.Progress()
     # Preset-based parameters
     # Reverberance (50%) HF-damping (50%) room-scale (100%) stereo-depth (100%) pre-delay (0ms) wet-gain (0dB)
     if preset == "music":
         lfe_cutoff = 120
         reverb_args = ['20', '50', '100', '100', '100', '0']
     else:
+        raise gr.Error(f"Unknown preset: {preset}")
+    p((1,7),"Extracting Centre")# 1. Extract FL/FR/phantom centre
     fs, FL, FR, FC = extract_phantom_center(input_file)
+    p((2,7),"Getting File")# 2. Get stereo original for reverb
     wav = convert_to_wav_float(input_file)
     stereo, _ = sf.read(wav, dtype='float32')
     os.unlink(wav)
     L_orig, R_orig = stereo[:, 0], stereo[:, 1]
+    p((3,7),"Reverb For Rear")# 3. Wet-only reverb with chosen settings
     SL = apply_reverb_wet_only(L_orig, fs, reverb_args)
     SR = apply_reverb_wet_only(R_orig, fs, reverb_args)
+    p((4,7),"Highpassing")# 4. Highpass filter everything except LFE
     FL_hp = sox_filter(FL, fs, 'highpass', hp_cutoff)
     FR_hp = sox_filter(FR, fs, 'highpass', hp_cutoff)
     FC_hp = sox_filter(FC, fs, 'highpass', hp_cutoff)
     SL_hp = sox_filter(SL, fs, 'highpass', hp_cutoff)
     SR_hp = sox_filter(SR, fs, 'highpass', hp_cutoff)
+    p((5,7),"Extracting LFE")# 5. Lowpass for LFE
     bass_sum = .5 * (L_orig + R_orig)
     LFE = sox_filter(bass_sum, fs, 'lowpass', lfe_cutoff)
+    p((6,7),"Stacking")# 6. Stack and pad
     channels = [FL_hp, FR_hp, FC_hp, LFE, SL_hp, SR_hp]
     length = max(len(ch) for ch in channels)
     def pad(x): return np.pad(x, (0, length - len(x)))
     multich = np.column_stack([pad(ch) for ch in channels])
+    p((7,7),"Encoding")# 7. Write WAV and encode to OGG
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
     sf.write(out_wav.name, multich, fs, subtype='FLOAT')
     out_wav.close()
     os.unlink(out_wav.name)
     return out_ogg.name
+import mimetypes
+import requests
+import time
+def send_mvsep_audio_job(
+    api_token: str,
+    audio_bytes: bytes,
+    filename: str,
+    sep_type: int = 34,
+    output_format: int = 2,
+    addopt1: str = None,
+    addopt2: str = None,
+    poll_interval_sec: int = 5
+):
+    """
+    Send audio to MVSep for source separation and wait for the result.
+    Args:
+        api_token (str): Your API token.
+        audio_bytes (bytes): Audio data (any format).
+        filename (str): Original filename, used for extension/MIME type.
+        sep_type (int): Separation type (e.g., 34 for karaoke).
+        output_format (int): Output format (e.g., 2 for FLAC).
+        addopt1 (str): Optional extra parameter 1.
+        addopt2 (str): Optional extra parameter 2.
+        poll_interval_sec (int): How often to check job status.
+    Returns:
+        dict: Completed result data from mvsep.com (including file URLs).
+    """
+    # Step 1: Determine MIME type
+    mime_type, _ = mimetypes.guess_type(filename)
+    if not mime_type:
+        mime_type = "application/octet-stream"  # fallback
+    # Step 2: Prepare request
+    url = "https://mvsep.com/api/separation/create"
+    files = {
+        'audiofile': (filename, audio_bytes, mime_type)
+    }
+    data = {
+        'api_token': api_token,
+        'sep_type': str(sep_type),
+        'output_format': str(output_format)
+    }
+    if addopt1:
+        data['add_opt1'] = addopt1
+    if addopt2:
+        data['add_opt2'] = addopt2
+    # Step 3: Send creation request
+    response = requests.post(url, files=files, data=data)
+    response.raise_for_status()
+    json_resp = response.json()
+    if not json_resp.get('success'):
+        error_msg = json_resp.get('data', {}).get('message', 'Unknown error')
+        raise gr.Error(f"API error: {error_msg}")
+    job_hash = json_resp['data']['hash']
+    print(f"Job submitted successfully. Hash: {job_hash}")
+    # Step 4: Poll until job is done
+    status_url = "https://mvsep.com/api/separation/get"
+    while True:
+        poll_resp = requests.get(status_url, params={'hash': job_hash})
+        poll_resp.raise_for_status()
+        poll_data = poll_resp.json()
+        status = poll_data.get('status')
+        print(f"Job status: {status}")
+        if status == 'done':
+            return poll_data.get('data', {})
+        elif status in ('failed', 'not_found'):
+            raise gr.Error(f"Job failed or not found: {poll_data.get('data', {}).get('message', '')}")
+        time.sleep(poll_interval_sec)
+# Download WAV and preserve sample rate, with optional resampling to target_fs
+def download_wav(url, target_fs=None):
+    r = requests.get(url)
+    r.raise_for_status()
+    temp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    temp.write(r.content)
+    temp.close()
+    audio, sr = sf.read(temp.name, dtype='float32')
+    os.unlink(temp.name)
+    if target_fs and sr != target_fs:
+        # resample if needed
+        num_samples = int(len(audio) * target_fs / sr)
+        audio = signal.resample(audio, num_samples)
+        sr = target_fs
+    return audio, sr
+# Smart mode workflow
+def smart_mode_process(input_file, api_key):
+    p = gr.Progress()
+    import shutil
+    if not api_key:
+        raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>.")
+    # Load original
+    wav = convert_to_wav_float(input_file)
+    data, fs = sf.read(wav, dtype='float32')
+    os.unlink(wav)
+    p((0,7), "Loading File")
+    if data.ndim != 2:
+        raise gr.Error("Expected stereo input")
+    L, R = data[:, 0], data[:, 1]
+    stereo = np.column_stack([L, R])
+    # Step 1: LFE from lowpass
+    p((1,7), "Processing LFE")
+    bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
+    # Step 2: Highpass for MVSep
+    p((2,7), "Processing Speech, Music and SFX")
+    hp_left = sox_filter(L, fs, 'highpass', 120)
+    hp_right = sox_filter(R, fs, 'highpass', 120)
+    hp_stereo = np.column_stack([hp_left, hp_right])
+    hp_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+    sf.write(hp_buf.name, hp_stereo, fs, format='FLAC', subtype='PCM_16')
+    hp_buf.close()
+    # Send to MVSep
+    demucs_resp = send_mvsep_audio_job(
+        api_key, open(hp_buf.name, 'rb').read(), os.path.basename(hp_buf.name), sep_type=24, output_format=2, addopt1=1
+    )
+    os.unlink(hp_buf.name)
+    dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
+    sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
+    music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
+    # Step 3: Extract crowd
+    p((3,7), "Extracting Crowd")
+    music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+    sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
+    music_buf.close()
+    crowd_resp = send_mvsep_audio_job(
+        api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name), sep_type=34, output_format=2, addopt1=1
+    )
+    os.unlink(music_buf.name)
+    crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
+    other, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
+    # Step 4: Extract vocals
+    p((4,7), "Extracting Vocals")
+    other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+    sf.write(other_buf.name, other, fs, format='FLAC', subtype='PCM_16')
+    other_buf.close()
+    karaoke_resp = send_mvsep_audio_job(
+        api_key, open(other_buf.name, 'rb').read(), os.path.basename(other_buf.name), sep_type=49, output_format=2, addopt1=3, addopt2=1
+    )
+    os.unlink(other_buf.name)
+    vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
+    vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
+    instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
+    # Step 5: Phantom center for lead vocals
+    p((5,7), "Distributing Front Vocal Channels")
+    vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    sf.write(vl_buf.name, vocals_lead, fs, subtype='FLOAT')
+    vl_buf.close()
+    _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
+    os.unlink(vl_buf.name)
+    # Step 6: Map channels and pad
+    p((6,7), "Mapping Channels")
+    def match_len(x, length): return np.pad(x, (0, length - len(x)))
+    lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), sfx.shape[0], crowd.shape[0], vocals_back.shape[0], instr.shape[0]]
+    length = max(lens)
+    out_L = match_len(FL_vl, length) + match_len(instr[:,0], length)
+    out_R = match_len(FR_vl, length) + match_len(instr[:,1], length)
+    out_C = match_len(FC_vl, length)
+    out_LFE = match_len(bass, length)
+    SL = match_len(vocals_back[:,0], length) + match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
+    SR = match_len(vocals_back[:,1], length) + match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
+    # Step 7: Encode to 5.1 OGG
+    p((7,7), "Processing Step 7, Encoding")
+    multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
+    out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
+    sf.write(out_wav.name, multich, fs, subtype='FLOAT')
+    out_wav.close()
+    out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
+    subprocess.run([
+        "ffmpeg", "-y", "-i", out_wav.name,
+        "-c:a", "libvorbis", "-ac", "6", "-channel_layout", "5.1", out_ogg.name
+    ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
+    os.unlink(out_wav.name)
+    return out_ogg.name
 # ========== Gradio UI ==========
 with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
     gr.Markdown("Choose music or speech preset for surround processing")
     inp = gr.Audio(label="Upload stereo audio", type="filepath")
+    smart_mode = gr.Checkbox(label="Enable Smart Mode", value=False)
+    # Normal mode elements
     preset = gr.Dropdown(
+        label="Select Preset",
+        choices=["music", "speech", "open"],
+        value="music"
+    )
     btn = gr.Button("Convert to 5.1 OGG")
     out = gr.File(label="Download 5.1 OGG")
+    # Smart mode section
+    with gr.Column(visible=False) as smart_section:
+        api_key = gr.Textbox(label="MVSep API Key", type="password")
+        smart_btn = gr.Button("Start")
+        smart_out = gr.File(label="Output from Smart Mode")
+    # Logic for toggling sections
+    def toggle_mode(enabled):
+        return (
+            gr.update(visible=not enabled),  # preset
+            gr.update(visible=not enabled),  # btn
+            gr.update(visible=not enabled),  # out
+            gr.update(visible=enabled)       # smart_section
+        )
+    smart_mode.change(
+        fn=toggle_mode,
+        inputs=[smart_mode],
+        outputs=[preset, btn, out, smart_section]
+    )
+    # Button functions
+    btn.click(fn=create_5_1_surround, inputs=[inp, preset], outputs=[out], concurrency_limit=10)
+    smart_btn.click(fn=smart_mode_process, inputs=[inp, api_key], outputs=[smart_out], concurrency_limit=20)
 if __name__ == "__main__":
+    demo.launch(show_error=True)