ziqiangao committed on
Commit
938bd0e
·
1 Parent(s): 05d36ea

Add Smart Mode

Browse files
Files changed (1) hide show
  1. app.py +241 -15
app.py CHANGED
@@ -66,7 +66,7 @@ def extract_phantom_center(input_file, rdf=0.99999):
66
  data, fs = sf.read(wav, dtype='float32')
67
  os.unlink(wav)
68
  if data.ndim != 2 or data.shape[1] != 2:
69
- raise ValueError("Input must be stereo 2-channel")
70
  L, R = data[:,0], data[:,1]
71
  M = (L + R) / 2
72
  nperseg = fs
@@ -84,6 +84,7 @@ def extract_phantom_center(input_file, rdf=0.99999):
84
 
85
 
86
  def create_5_1_surround(input_file, preset="music"):
 
87
  # Preset-based parameters
88
  # Reverberance (50%) HF-damping (50%) room-scale (100%) stereo-depth (100%) pre-delay (0ms) wet-gain (0dB)
89
  if preset == "music":
@@ -99,39 +100,40 @@ def create_5_1_surround(input_file, preset="music"):
99
  lfe_cutoff = 120
100
  reverb_args = ['20', '50', '100', '100', '100', '0']
101
  else:
102
- raise ValueError(f"Unknown preset: {preset}")
103
 
104
- # 1. Extract FL/FR/phantom centre
 
105
  fs, FL, FR, FC = extract_phantom_center(input_file)
106
 
107
- # 2. Get stereo original for reverb
108
  wav = convert_to_wav_float(input_file)
109
  stereo, _ = sf.read(wav, dtype='float32')
110
  os.unlink(wav)
111
  L_orig, R_orig = stereo[:, 0], stereo[:, 1]
112
 
113
- # 3. Wet-only reverb with chosen settings
114
  SL = apply_reverb_wet_only(L_orig, fs, reverb_args)
115
  SR = apply_reverb_wet_only(R_orig, fs, reverb_args)
116
 
117
- # 4. Highpass filter everything except LFE
118
  FL_hp = sox_filter(FL, fs, 'highpass', hp_cutoff)
119
  FR_hp = sox_filter(FR, fs, 'highpass', hp_cutoff)
120
  FC_hp = sox_filter(FC, fs, 'highpass', hp_cutoff)
121
  SL_hp = sox_filter(SL, fs, 'highpass', hp_cutoff)
122
  SR_hp = sox_filter(SR, fs, 'highpass', hp_cutoff)
123
 
124
- # 5. Lowpass for LFE
125
  bass_sum = .5 * (L_orig + R_orig)
126
  LFE = sox_filter(bass_sum, fs, 'lowpass', lfe_cutoff)
127
 
128
- # 6. Stack and pad
129
  channels = [FL_hp, FR_hp, FC_hp, LFE, SL_hp, SR_hp]
130
  length = max(len(ch) for ch in channels)
131
  def pad(x): return np.pad(x, (0, length - len(x)))
132
  multich = np.column_stack([pad(ch) for ch in channels])
133
 
134
- # 7. Write WAV and encode to OGG
135
  out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
136
  sf.write(out_wav.name, multich, fs, subtype='FLOAT')
137
  out_wav.close()
@@ -144,6 +146,204 @@ def create_5_1_surround(input_file, preset="music"):
144
  os.unlink(out_wav.name)
145
  return out_ogg.name
146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  # ========== Gradio UI ==========
149
  with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
@@ -151,15 +351,41 @@ with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
151
  gr.Markdown("Choose music or speech preset for surround processing")
152
 
153
  inp = gr.Audio(label="Upload stereo audio", type="filepath")
 
 
 
154
  preset = gr.Dropdown(
155
- label="Select Preset",
156
- choices=["music", "speech", "open"],
157
- value="music" # or whichever you want as the default
158
- )
159
  btn = gr.Button("Convert to 5.1 OGG")
160
  out = gr.File(label="Download 5.1 OGG")
161
 
162
- btn.click(fn=create_5_1_surround, inputs=[inp, preset], outputs=[out])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
164
  if __name__ == "__main__":
165
- demo.launch()
 
66
  data, fs = sf.read(wav, dtype='float32')
67
  os.unlink(wav)
68
  if data.ndim != 2 or data.shape[1] != 2:
69
+ raise gr.Error("Input must be stereo 2-channel")
70
  L, R = data[:,0], data[:,1]
71
  M = (L + R) / 2
72
  nperseg = fs
 
84
 
85
 
86
  def create_5_1_surround(input_file, preset="music"):
87
+ p = gr.Progress()
88
  # Preset-based parameters
89
  # Reverberance (50%) HF-damping (50%) room-scale (100%) stereo-depth (100%) pre-delay (0ms) wet-gain (0dB)
90
  if preset == "music":
 
100
  lfe_cutoff = 120
101
  reverb_args = ['20', '50', '100', '100', '100', '0']
102
  else:
103
+ raise gr.Error(f"Unknown preset: {preset}")
104
 
105
+
106
+ p((1,7),"Extracting Centre")# 1. Extract FL/FR/phantom centre
107
  fs, FL, FR, FC = extract_phantom_center(input_file)
108
 
109
+ p((2,7),"Getting File")# 2. Get stereo original for reverb
110
  wav = convert_to_wav_float(input_file)
111
  stereo, _ = sf.read(wav, dtype='float32')
112
  os.unlink(wav)
113
  L_orig, R_orig = stereo[:, 0], stereo[:, 1]
114
 
115
+ p((3,7),"Reverb For Rear")# 3. Wet-only reverb with chosen settings
116
  SL = apply_reverb_wet_only(L_orig, fs, reverb_args)
117
  SR = apply_reverb_wet_only(R_orig, fs, reverb_args)
118
 
119
+ p((4,7),"Highpassing")# 4. Highpass filter everything except LFE
120
  FL_hp = sox_filter(FL, fs, 'highpass', hp_cutoff)
121
  FR_hp = sox_filter(FR, fs, 'highpass', hp_cutoff)
122
  FC_hp = sox_filter(FC, fs, 'highpass', hp_cutoff)
123
  SL_hp = sox_filter(SL, fs, 'highpass', hp_cutoff)
124
  SR_hp = sox_filter(SR, fs, 'highpass', hp_cutoff)
125
 
126
+ p((5,7),"Extracting LFE")# 5. Lowpass for LFE
127
  bass_sum = .5 * (L_orig + R_orig)
128
  LFE = sox_filter(bass_sum, fs, 'lowpass', lfe_cutoff)
129
 
130
+ p((6,7),"Stacking")# 6. Stack and pad
131
  channels = [FL_hp, FR_hp, FC_hp, LFE, SL_hp, SR_hp]
132
  length = max(len(ch) for ch in channels)
133
  def pad(x): return np.pad(x, (0, length - len(x)))
134
  multich = np.column_stack([pad(ch) for ch in channels])
135
 
136
+ p((7,7),"Encoding")# 7. Write WAV and encode to OGG
137
  out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
138
  sf.write(out_wav.name, multich, fs, subtype='FLOAT')
139
  out_wav.close()
 
146
  os.unlink(out_wav.name)
147
  return out_ogg.name
148
 
149
+ import mimetypes
150
+ import requests
151
+ import time
152
+
153
def send_mvsep_audio_job(
    api_token: str,
    audio_bytes: bytes,
    filename: str,
    sep_type: int = 34,
    output_format: int = 2,
    addopt1: "str | int | None" = None,
    addopt2: "str | int | None" = None,
    poll_interval_sec: int = 5,
    request_timeout_sec: float = 120.0,
    max_wait_sec: "float | None" = None,
):
    """
    Send audio to MVSep for source separation and wait for the result.

    Args:
        api_token: MVSep API token, sent as the ``api_token`` form field.
        audio_bytes: Raw bytes of the audio file to upload.
        filename: File name reported to the API; its extension is also used
            to guess the MIME type of the upload.
        sep_type: Numeric separation-type id, sent as ``sep_type``.
        output_format: Numeric output-format id, sent as ``output_format``.
            NOTE(review): an earlier note said 2 selects FLAC - confirm
            against the MVSep API documentation.
        addopt1: Optional value for the ``add_opt1`` form field.
        addopt2: Optional value for the ``add_opt2`` form field.
        poll_interval_sec: Seconds to sleep between status polls.
        request_timeout_sec: Timeout applied to every HTTP request so a
            stalled connection cannot block the caller forever.
        max_wait_sec: Upper bound on the total polling time; ``None``
            (the default) keeps polling until the job finishes or fails.

    Returns:
        dict: The ``data`` member of the final status response (callers in
        this file read ``['files'][i]['url']`` from it).

    Raises:
        gr.Error: If the API rejects the job, reports it as failed or
            not found, or ``max_wait_sec`` elapses first.
        requests.HTTPError: If any HTTP response has an error status.
    """

    def _message(payload, default):
        # The API normally nests the message in a dict; tolerate other shapes.
        if isinstance(payload, dict):
            return payload.get('message', default)
        return default

    # Step 1: Determine MIME type
    mime_type, _ = mimetypes.guess_type(filename)
    if not mime_type:
        mime_type = "application/octet-stream"  # fallback

    # Step 2: Prepare request
    url = "https://mvsep.com/api/separation/create"
    files = {
        'audiofile': (filename, audio_bytes, mime_type),
    }
    data = {
        'api_token': api_token,
        'sep_type': str(sep_type),
        'output_format': str(output_format),
    }
    # Compare against None so that falsy-but-valid option values (e.g. 0)
    # are still forwarded to the API.
    if addopt1 is not None:
        data['add_opt1'] = addopt1
    if addopt2 is not None:
        data['add_opt2'] = addopt2

    # Step 3: Send creation request
    response = requests.post(url, files=files, data=data, timeout=request_timeout_sec)
    response.raise_for_status()
    json_resp = response.json()

    if not json_resp.get('success'):
        error_msg = _message(json_resp.get('data'), 'Unknown error')
        raise gr.Error(f"API error: {error_msg}")

    job_hash = json_resp['data']['hash']
    print(f"Job submitted successfully. Hash: {job_hash}")

    # Step 4: Poll until job is done
    status_url = "https://mvsep.com/api/separation/get"
    started = time.monotonic()
    while True:
        poll_resp = requests.get(
            status_url, params={'hash': job_hash}, timeout=request_timeout_sec
        )
        poll_resp.raise_for_status()
        poll_data = poll_resp.json()

        status = poll_data.get('status')
        print(f"Job status: {status}")

        if status == 'done':
            return poll_data.get('data', {})
        if status in ('failed', 'not_found'):
            raise gr.Error(f"Job failed or not found: {_message(poll_data.get('data'), '')}")

        if max_wait_sec is not None and time.monotonic() - started >= max_wait_sec:
            raise gr.Error(f"Timed out waiting for MVSep job {job_hash}")

        time.sleep(poll_interval_sec)
227
+
228
+ # Download WAV and preserve sample rate, with optional resampling to target_fs
229
+
230
def download_wav(url, target_fs=None, timeout=120):
    """
    Download an audio file and decode it to float32 samples.

    Args:
        url: Location of the audio file to fetch.
        target_fs: If given and different from the file's sample rate, the
            audio is resampled to this rate.
        timeout: Seconds allowed for the HTTP request, so a stalled
            download cannot hang the caller indefinitely.

    Returns:
        tuple: ``(audio, sr)`` - the decoded samples and the sample rate
        they are returned at (``target_fs`` when resampling happened).

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    r = requests.get(url, timeout=timeout)
    r.raise_for_status()

    # soundfile is given a path, so spill the payload to a temp file first.
    temp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    try:
        try:
            temp.write(r.content)
        finally:
            temp.close()
        audio, sr = sf.read(temp.name, dtype='float32')
    finally:
        # Remove the temp file even when writing or decoding fails.
        os.unlink(temp.name)

    if target_fs and sr != target_fs:
        # resample if needed (signal.resample works along axis 0, i.e. the
        # sample axis of the array returned by sf.read)
        num_samples = int(len(audio) * target_fs / sr)
        audio = signal.resample(audio, num_samples)
        sr = target_fs
    return audio, sr
244
+
245
+ # Smart mode workflow
246
def _separate_via_mvsep(audio, fs, api_key, sep_type, addopt1=None, addopt2=None):
    """Encode *audio* as a temporary 16-bit FLAC, submit it to MVSep and return the job result."""
    buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
    buf.close()  # release the handle before soundfile reopens the path
    try:
        sf.write(buf.name, audio, fs, format='FLAC', subtype='PCM_16')
        with open(buf.name, 'rb') as fh:
            payload = fh.read()
    finally:
        # The bytes are in memory now; never leave the temp file behind.
        os.unlink(buf.name)
    return send_mvsep_audio_job(
        api_key, payload, os.path.basename(buf.name),
        sep_type=sep_type, output_format=2, addopt1=addopt1, addopt2=addopt2
    )


def _pad_to(x, length):
    """Zero-pad the 1-D array *x* at the end so it has *length* samples."""
    return np.pad(x, (0, length - len(x)))


def smart_mode_process(input_file, api_key):
    """
    Build a 5.1 OGG from a stereo file using MVSep stem separation.

    The input is split into a low-passed LFE signal and a high-passed stereo
    signal; the latter goes through three MVSep jobs (sep_type 24, 34, 49)
    whose stems are mapped to the six output channels and encoded with ffmpeg.

    Args:
        input_file: Path of the audio file to convert.
        api_key: MVSep API key; required.

    Returns:
        str: Path of the generated ``.ogg`` file (a temp file the caller owns).

    Raises:
        gr.Error: If no API key is given or the input is not 2-channel.
        subprocess.CalledProcessError: If ffmpeg fails to encode the result.
    """
    p = gr.Progress()

    if not api_key:
        raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>.")

    # Load original
    p((0,7), "Loading File")
    wav = convert_to_wav_float(input_file)
    try:
        data, fs = sf.read(wav, dtype='float32')
    finally:
        os.unlink(wav)

    # Same stereo check as extract_phantom_center: 2-D with exactly 2 channels.
    if data.ndim != 2 or data.shape[1] != 2:
        raise gr.Error("Expected stereo input")
    L, R = data[:, 0], data[:, 1]

    # Step 1: LFE from lowpass
    p((1,7), "Processing LFE")
    bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)

    # Step 2: Highpass for MVSep
    p((2,7), "Processing Speech, Music and SFX")
    hp_left = sox_filter(L, fs, 'highpass', 120)
    hp_right = sox_filter(R, fs, 'highpass', 120)
    hp_stereo = np.column_stack([hp_left, hp_right])

    demucs_resp = _separate_via_mvsep(hp_stereo, fs, api_key, sep_type=24, addopt1=1)
    # NOTE(review): files[0] (the dialog stem) was previously downloaded but
    # never mixed into any output channel, so it is no longer fetched.
    # Confirm whether dialog should feed the centre channel.
    sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
    music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)

    # Step 3: Extract crowd
    p((3,7), "Extracting Crowd")
    crowd_resp = _separate_via_mvsep(music, fs, api_key, sep_type=34, addopt1=1)
    crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
    other, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)

    # Step 4: Extract vocals
    p((4,7), "Extracting Vocals")
    karaoke_resp = _separate_via_mvsep(other, fs, api_key, sep_type=49, addopt1=3, addopt2=1)
    vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
    vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
    instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)

    # Step 5: Phantom center for lead vocals
    p((5,7), "Distributing Front Vocal Channels")
    vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    vl_buf.close()
    try:
        sf.write(vl_buf.name, vocals_lead, fs, subtype='FLOAT')
        _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
    finally:
        os.unlink(vl_buf.name)

    # Step 6: Map channels and pad
    # Stems are indexed as [:, 0] / [:, 1] below, i.e. assumed to be stereo.
    p((6,7), "Mapping Channels")
    length = max(
        len(FL_vl), len(FR_vl), len(FC_vl), len(bass),
        sfx.shape[0], crowd.shape[0], vocals_back.shape[0], instr.shape[0],
    )

    out_L = _pad_to(FL_vl, length) + _pad_to(instr[:,0], length)
    out_R = _pad_to(FR_vl, length) + _pad_to(instr[:,1], length)
    out_C = _pad_to(FC_vl, length)
    out_LFE = _pad_to(bass, length)
    SL = _pad_to(vocals_back[:,0], length) + _pad_to(sfx[:,0], length) + _pad_to(crowd[:,0], length)
    SR = _pad_to(vocals_back[:,1], length) + _pad_to(sfx[:,1], length) + _pad_to(crowd[:,1], length)

    # Step 7: Encode to 5.1 OGG
    p((7,7), "Processing Step 7, Encoding")
    multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
    out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    out_wav.close()
    out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
    out_ogg.close()  # only the path is needed; ffmpeg overwrites it (-y)
    try:
        sf.write(out_wav.name, multich, fs, subtype='FLOAT')
        subprocess.run([
            "ffmpeg", "-y", "-i", out_wav.name,
            "-c:a", "libvorbis", "-ac", "6", "-channel_layout", "5.1", out_ogg.name
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
    except Exception:
        # Do not leave a useless output file behind when encoding fails.
        os.unlink(out_ogg.name)
        raise
    finally:
        os.unlink(out_wav.name)

    return out_ogg.name
347
 
348
  # ========== Gradio UI ==========
349
  with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
 
351
  gr.Markdown("Choose music or speech preset for surround processing")
352
 
353
  inp = gr.Audio(label="Upload stereo audio", type="filepath")
354
+ smart_mode = gr.Checkbox(label="Enable Smart Mode", value=False)
355
+
356
+ # Normal mode elements
357
  preset = gr.Dropdown(
358
+ label="Select Preset",
359
+ choices=["music", "speech", "open"],
360
+ value="music"
361
+ )
362
  btn = gr.Button("Convert to 5.1 OGG")
363
  out = gr.File(label="Download 5.1 OGG")
364
 
365
+ # Smart mode section
366
+ with gr.Column(visible=False) as smart_section:
367
+ api_key = gr.Textbox(label="MVSep API Key", type="password")
368
+ smart_btn = gr.Button("Start")
369
+ smart_out = gr.File(label="Output from Smart Mode")
370
+
371
+ # Logic for toggling sections
372
def toggle_mode(enabled):
    """Return visibility updates for (preset, btn, out, smart_section).

    The three normal-mode widgets are shown only while smart mode is off;
    the smart-mode section is shown only while it is on.
    """
    # One independent update object per normal-mode component.
    normal_updates = tuple(gr.update(visible=not enabled) for _ in range(3))
    smart_update = gr.update(visible=enabled)
    return (*normal_updates, smart_update)
379
+
380
+ smart_mode.change(
381
+ fn=toggle_mode,
382
+ inputs=[smart_mode],
383
+ outputs=[preset, btn, out, smart_section]
384
+ )
385
+
386
+ # Button functions
387
+ btn.click(fn=create_5_1_surround, inputs=[inp, preset], outputs=[out], concurrency_limit=10)
388
+ smart_btn.click(fn=smart_mode_process, inputs=[inp, api_key], outputs=[smart_out], concurrency_limit=20)
389
 
390
  if __name__ == "__main__":
391
+ demo.launch(show_error=True)