ziqiangao committed · 1da61b3 · Parent(s): a37de4d
corrrect process
app.py CHANGED
@@ -262,7 +262,6 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     if data.ndim != 2:
         raise gr.Error("Expected stereo input")
     L, R = data[:, 0], data[:, 1]
-    stereo = np.column_stack([L, R])
 
     # Step 1: LFE from lowpass
     p((1, 8), "Processing LFE")
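Note: the "LFE from lowpass" step is only referenced in this hunk, not shown. For orientation, a minimal sketch of deriving an LFE channel by low-pass filtering the mono sum; the 120 Hz cutoff, the Butterworth design, and the scipy dependency are assumptions, not code from app.py.

import numpy as np
from scipy import signal  # assumption: scipy is available in the Space

def make_lfe(L, R, fs, cutoff_hz=120.0):
    # Hypothetical helper: low-pass the mono sum to approximate an LFE feed.
    mono = 0.5 * (L + R)
    sos = signal.butter(4, cutoff_hz, btype="lowpass", fs=fs, output="sos")
    return signal.sosfiltfilt(sos, mono)  # zero-phase, so the bass is not delayed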
@@ -285,19 +284,9 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
     other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
 
-    # Step 3: Reverb using SoX (like regular mode)
-    p((3, 8), "Applying Reverb")
 
-    #
-
-
-    # Apply reverb to left and right channels separately
-    reverb_L = apply_reverb_wet_only(other_after_crowd[:, 0], fs, reverb_args)
-    reverb_R = apply_reverb_wet_only(other_after_crowd[:, 1], fs, reverb_args)
-    reverb = np.column_stack([reverb_L, reverb_R])
-
-    # Step 4: Speech, music, SFX separation from 'other_after_crowd'
-    p((4, 8), "Separating Speech, Music, and SFX")
+    # Step 3: Speech, music, SFX separation from 'other_after_crowd'
+    p((3, 8), "Separating Speech, Music, and SFX")
     demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
     sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
     demucs_input_buf.close()
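Note: every separation step in this diff fetches stems with download_wav(url, target_fs=fs), whose body lies outside the changed hunks. The sketch below shows one plausible implementation (HTTP fetch, decode, resample); requests and scipy's resample_poly are assumptions, only soundfile (sf) appears in the diff itself.

import io
import requests                          # assumption
import soundfile as sf                   # sf is already used in app.py
from scipy.signal import resample_poly   # assumption

def download_wav(url, target_fs):
    # Hypothetical sketch: download an audio file and return (samples, fs)
    # resampled to target_fs, shaped (num_samples, num_channels).
    resp = requests.get(url, timeout=120)
    resp.raise_for_status()
    data, fs = sf.read(io.BytesIO(resp.content), always_2d=True)
    if fs != target_fs:
        data = resample_poly(data, target_fs, fs, axis=0)
    return data, target_fs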
@@ -312,6 +301,14 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
     music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
 
+    # Step 4: Apply Reverb to the 'music' stem
+    p((4, 8), "Applying Reverb")
+    reverb_args = ['70', '40', '100', '95', '10', '0'] # music preset
+    reverb_L = apply_reverb_wet_only(music[:, 0], fs, reverb_args)
+    reverb_R = apply_reverb_wet_only(music[:, 1], fs, reverb_args)
+    reverb = np.column_stack([reverb_L, reverb_R])
+
+
     # Step 5: Vocal Extraction from music
     p((5, 8), "Extracting Vocals")
     music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
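Note: the removed comment says the reverb comes from SoX, and apply_reverb_wet_only is called per channel with six string arguments that line up with SoX's reverb parameters (reverberance, HF-damping, room-scale, stereo-depth, pre-delay, wet-gain). A wet-only invocation could look like the sketch below; the temp-file handling is an assumption, not the Space's actual code.

import os
import subprocess
import tempfile
import soundfile as sf

def apply_reverb_wet_only(mono, fs, reverb_args):
    # Hypothetical sketch: round-trip one channel through `sox ... reverb -w`.
    src = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    dst = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    src.close(); dst.close()
    try:
        sf.write(src.name, mono, fs)
        subprocess.run(["sox", src.name, dst.name, "reverb", "-w", *reverb_args],
                       check=True)
        wet, _ = sf.read(dst.name)
        return wet
    finally:
        os.unlink(src.name)
        os.unlink(dst.name)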
@@ -338,6 +335,9 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
     os.unlink(vl_buf.name)
 
+    # Mix dialog into the centre channel
+    FC_vl += dialog[:, 0] if dialog.ndim == 2 else dialog
+
     # Step 7: Mapping and stacking
     p((7, 8), "Mapping Channels and Encoding")
     def match_len(x, length): return np.pad(x, (0, length - len(x)))
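Note: extract_phantom_center returns front-left, front-right, and a centre estimate, and the new lines then add the dialog stem into that centre. The extraction itself is defined elsewhere in app.py; for orientation only, a crude mid/side stand-in is sketched below and is not the app's actual algorithm.

import soundfile as sf

def extract_phantom_center_sketch(path):
    # Crude approximation: treat the mid signal as the phantom centre and the
    # residuals as front left/right. Real extractors usually weight this by
    # inter-channel correlation; this is purely illustrative.
    x, fs = sf.read(path, always_2d=True)
    L, R = x[:, 0], x[:, 1]
    FC = 0.5 * (L + R)      # shared (mid) component
    FL = L - FC             # left residual
    FR = R - FC             # right residual
    return fs, FL, FR, FC   # mirrors the (_, FL_vl, FR_vl, FC_vl) unpacking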
@@ -354,14 +354,12 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     SL = match_len(reverb[:, 0], length)
     SR = match_len(reverb[:, 1], length)
 
-    # Optional: if multi_singer, don’t include backing vocals
     if not multi_singer:
         SL += match_len(vocals_back[:, 0], length)
         SR += match_len(vocals_back[:, 1], length)
     SL += match_len(crowd[:, 0], length)
     SR += match_len(crowd[:, 1], length)
 
-    # Final multichannel stack
     multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
 
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
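Note: the final column_stack orders channels as FL, FR, FC, LFE, SL, SR, i.e. a standard 5.1 layout, and the result goes into a temporary WAV before encoding. A minimal sketch of writing that array with soundfile is shown below; the PCM_24 subtype is an arbitrary choice, and whatever encoder app.py runs on out_wav afterwards is not part of this diff.

import soundfile as sf

def write_surround(path, multich, fs):
    # multich has shape (num_samples, 6) in FL, FR, FC, LFE, SL, SR order.
    sf.write(path, multich, fs, subtype="PCM_24")

# e.g. write_surround(out_wav.name, multich, fs)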