Spaces:
Running
Running
ziqiangao
committed on
Commit
·
938bd0e
1
Parent(s):
05d36ea
Add Smart Mode
Browse files
app.py
CHANGED
@@ -66,7 +66,7 @@ def extract_phantom_center(input_file, rdf=0.99999):
|
|
66 |
data, fs = sf.read(wav, dtype='float32')
|
67 |
os.unlink(wav)
|
68 |
if data.ndim != 2 or data.shape[1] != 2:
|
69 |
-
raise
|
70 |
L, R = data[:,0], data[:,1]
|
71 |
M = (L + R) / 2
|
72 |
nperseg = fs
|
@@ -84,6 +84,7 @@ def extract_phantom_center(input_file, rdf=0.99999):
|
|
84 |
|
85 |
|
86 |
def create_5_1_surround(input_file, preset="music"):
|
|
|
87 |
# Preset-based parameters
|
88 |
# Reverberance (50%) HF-damping (50%) room-scale (100%) stereo-depth (100%) pre-delay (0ms) wet-gain (0dB)
|
89 |
if preset == "music":
|
@@ -99,39 +100,40 @@ def create_5_1_surround(input_file, preset="music"):
|
|
99 |
lfe_cutoff = 120
|
100 |
reverb_args = ['20', '50', '100', '100', '100', '0']
|
101 |
else:
|
102 |
-
raise
|
103 |
|
104 |
-
|
|
|
105 |
fs, FL, FR, FC = extract_phantom_center(input_file)
|
106 |
|
107 |
-
# 2. Get stereo original for reverb
|
108 |
wav = convert_to_wav_float(input_file)
|
109 |
stereo, _ = sf.read(wav, dtype='float32')
|
110 |
os.unlink(wav)
|
111 |
L_orig, R_orig = stereo[:, 0], stereo[:, 1]
|
112 |
|
113 |
-
# 3. Wet-only reverb with chosen settings
|
114 |
SL = apply_reverb_wet_only(L_orig, fs, reverb_args)
|
115 |
SR = apply_reverb_wet_only(R_orig, fs, reverb_args)
|
116 |
|
117 |
-
# 4. Highpass filter everything except LFE
|
118 |
FL_hp = sox_filter(FL, fs, 'highpass', hp_cutoff)
|
119 |
FR_hp = sox_filter(FR, fs, 'highpass', hp_cutoff)
|
120 |
FC_hp = sox_filter(FC, fs, 'highpass', hp_cutoff)
|
121 |
SL_hp = sox_filter(SL, fs, 'highpass', hp_cutoff)
|
122 |
SR_hp = sox_filter(SR, fs, 'highpass', hp_cutoff)
|
123 |
|
124 |
-
# 5. Lowpass for LFE
|
125 |
bass_sum = .5 * (L_orig + R_orig)
|
126 |
LFE = sox_filter(bass_sum, fs, 'lowpass', lfe_cutoff)
|
127 |
|
128 |
-
# 6. Stack and pad
|
129 |
channels = [FL_hp, FR_hp, FC_hp, LFE, SL_hp, SR_hp]
|
130 |
length = max(len(ch) for ch in channels)
|
131 |
def pad(x): return np.pad(x, (0, length - len(x)))
|
132 |
multich = np.column_stack([pad(ch) for ch in channels])
|
133 |
|
134 |
-
# 7. Write WAV and encode to OGG
|
135 |
out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
136 |
sf.write(out_wav.name, multich, fs, subtype='FLOAT')
|
137 |
out_wav.close()
|
@@ -144,6 +146,204 @@ def create_5_1_surround(input_file, preset="music"):
|
|
144 |
os.unlink(out_wav.name)
|
145 |
return out_ogg.name
|
146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
|
148 |
# ========== Gradio UI ==========
|
149 |
with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
|
@@ -151,15 +351,41 @@ with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
|
|
151 |
gr.Markdown("Choose music or speech preset for surround processing")
|
152 |
|
153 |
inp = gr.Audio(label="Upload stereo audio", type="filepath")
|
|
|
|
|
|
|
154 |
preset = gr.Dropdown(
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
)
|
159 |
btn = gr.Button("Convert to 5.1 OGG")
|
160 |
out = gr.File(label="Download 5.1 OGG")
|
161 |
|
162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
if __name__ == "__main__":
|
165 |
-
demo.launch()
|
|
|
66 |
data, fs = sf.read(wav, dtype='float32')
|
67 |
os.unlink(wav)
|
68 |
if data.ndim != 2 or data.shape[1] != 2:
|
69 |
+
raise gr.Error("Input must be stereo 2-channel")
|
70 |
L, R = data[:,0], data[:,1]
|
71 |
M = (L + R) / 2
|
72 |
nperseg = fs
|
|
|
84 |
|
85 |
|
86 |
def create_5_1_surround(input_file, preset="music"):
|
87 |
+
p = gr.Progress()
|
88 |
# Preset-based parameters
|
89 |
# Reverberance (50%) HF-damping (50%) room-scale (100%) stereo-depth (100%) pre-delay (0ms) wet-gain (0dB)
|
90 |
if preset == "music":
|
|
|
100 |
lfe_cutoff = 120
|
101 |
reverb_args = ['20', '50', '100', '100', '100', '0']
|
102 |
else:
|
103 |
+
raise gr.Error(f"Unknown preset: {preset}")
|
104 |
|
105 |
+
|
106 |
+
p((1,7),"Extracting Centre")# 1. Extract FL/FR/phantom centre
|
107 |
fs, FL, FR, FC = extract_phantom_center(input_file)
|
108 |
|
109 |
+
p((2,7),"Getting File")# 2. Get stereo original for reverb
|
110 |
wav = convert_to_wav_float(input_file)
|
111 |
stereo, _ = sf.read(wav, dtype='float32')
|
112 |
os.unlink(wav)
|
113 |
L_orig, R_orig = stereo[:, 0], stereo[:, 1]
|
114 |
|
115 |
+
p((3,7),"Reverb For Rear")# 3. Wet-only reverb with chosen settings
|
116 |
SL = apply_reverb_wet_only(L_orig, fs, reverb_args)
|
117 |
SR = apply_reverb_wet_only(R_orig, fs, reverb_args)
|
118 |
|
119 |
+
p((4,7),"Highpassing")# 4. Highpass filter everything except LFE
|
120 |
FL_hp = sox_filter(FL, fs, 'highpass', hp_cutoff)
|
121 |
FR_hp = sox_filter(FR, fs, 'highpass', hp_cutoff)
|
122 |
FC_hp = sox_filter(FC, fs, 'highpass', hp_cutoff)
|
123 |
SL_hp = sox_filter(SL, fs, 'highpass', hp_cutoff)
|
124 |
SR_hp = sox_filter(SR, fs, 'highpass', hp_cutoff)
|
125 |
|
126 |
+
p((5,7),"Extracting LFE")# 5. Lowpass for LFE
|
127 |
bass_sum = .5 * (L_orig + R_orig)
|
128 |
LFE = sox_filter(bass_sum, fs, 'lowpass', lfe_cutoff)
|
129 |
|
130 |
+
p((6,7),"Stacking")# 6. Stack and pad
|
131 |
channels = [FL_hp, FR_hp, FC_hp, LFE, SL_hp, SR_hp]
|
132 |
length = max(len(ch) for ch in channels)
|
133 |
def pad(x): return np.pad(x, (0, length - len(x)))
|
134 |
multich = np.column_stack([pad(ch) for ch in channels])
|
135 |
|
136 |
+
p((7,7),"Encoding")# 7. Write WAV and encode to OGG
|
137 |
out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
138 |
sf.write(out_wav.name, multich, fs, subtype='FLOAT')
|
139 |
out_wav.close()
|
|
|
146 |
os.unlink(out_wav.name)
|
147 |
return out_ogg.name
|
148 |
|
149 |
+
import mimetypes
|
150 |
+
import requests
|
151 |
+
import time
|
152 |
+
|
153 |
+
def send_mvsep_audio_job(
    api_token: str,
    audio_bytes: bytes,
    filename: str,
    sep_type: int = 34,
    output_format: int = 2,
    addopt1: str = None,
    addopt2: str = None,
    poll_interval_sec: int = 5,
    request_timeout_sec: float = 120,
    max_wait_sec: float = None,
):
    """
    Send audio to MVSep for source separation and wait for the result.

    Args:
        api_token (str): Your API token.
        audio_bytes (bytes): Audio data to upload.
        filename (str): Filename sent with the upload; its extension is used
            to guess the MIME type.
        sep_type (int): Separation type code, forwarded as ``sep_type``.
        output_format (int): Output format code, forwarded as ``output_format``.
        addopt1: Optional extra parameter, forwarded as ``add_opt1``.
        addopt2: Optional extra parameter, forwarded as ``add_opt2``.
        poll_interval_sec (int): Seconds to sleep between status checks.
        request_timeout_sec (float): Timeout applied to each HTTP request so a
            stalled connection cannot block the caller forever.
        max_wait_sec (float): Upper bound on the total polling time. ``None``
            (the default) polls until the job reaches a final state.

    Returns:
        dict: The ``data`` member of the final status response.

    Raises:
        gr.Error: If the API rejects the job, the job ends as ``failed`` or
            ``not_found``, or ``max_wait_sec`` elapses first.
        requests.RequestException: On transport errors or non-2xx responses.
    """
    # Step 1: Determine MIME type
    mime_type, _ = mimetypes.guess_type(filename)
    if not mime_type:
        mime_type = "application/octet-stream"  # fallback

    # Step 2: Prepare request
    url = "https://mvsep.com/api/separation/create"
    files = {
        'audiofile': (filename, audio_bytes, mime_type)
    }
    data = {
        'api_token': api_token,
        'sep_type': str(sep_type),
        'output_format': str(output_format)
    }
    # Compare against None / "" rather than truthiness so that a legitimate
    # option value of 0 is still forwarded to the API.
    if addopt1 is not None and addopt1 != "":
        data['add_opt1'] = addopt1
    if addopt2 is not None and addopt2 != "":
        data['add_opt2'] = addopt2

    # Step 3: Send creation request
    response = requests.post(url, files=files, data=data, timeout=request_timeout_sec)
    response.raise_for_status()
    json_resp = response.json()

    if not json_resp.get('success'):
        error_msg = json_resp.get('data', {}).get('message', 'Unknown error')
        raise gr.Error(f"API error: {error_msg}")

    job_hash = json_resp['data']['hash']
    print(f"Job submitted successfully. Hash: {job_hash}")

    # Step 4: Poll until job is done
    status_url = "https://mvsep.com/api/separation/get"
    deadline = None if max_wait_sec is None else time.monotonic() + max_wait_sec
    while True:
        poll_resp = requests.get(
            status_url, params={'hash': job_hash}, timeout=request_timeout_sec
        )
        poll_resp.raise_for_status()
        poll_data = poll_resp.json()

        status = poll_data.get('status')
        print(f"Job status: {status}")

        if status == 'done':
            return poll_data.get('data', {})
        elif status in ('failed', 'not_found'):
            raise gr.Error(f"Job failed or not found: {poll_data.get('data', {}).get('message', '')}")

        # Any other status is treated as "still in progress".
        if deadline is not None and time.monotonic() >= deadline:
            raise gr.Error(f"Timed out waiting for MVSep job {job_hash} (last status: {status})")

        time.sleep(poll_interval_sec)
|
227 |
+
|
228 |
+
# Download WAV and preserve sample rate, with optional resampling to target_fs
|
229 |
+
|
230 |
+
def download_wav(url, target_fs=None):
    """
    Download an audio file and decode it to float32 samples.

    Args:
        url: Location of the audio file to fetch.
        target_fs: Optional sample rate. When given and different from the
            file's own rate, the audio is resampled to it.

    Returns:
        tuple: ``(audio, sr)`` - the decoded samples and their sample rate
        (``target_fs`` if resampling happened).

    Raises:
        requests.RequestException: On transport errors or non-2xx responses.
    """
    # A timeout keeps a stalled download from blocking the worker forever.
    r = requests.get(url, timeout=120)
    r.raise_for_status()

    # soundfile reads from a path, so spill the payload to a temp file.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp:
        temp.write(r.content)
    try:
        audio, sr = sf.read(temp.name, dtype='float32')
    finally:
        # Remove the temp file even when decoding fails.
        os.unlink(temp.name)

    if target_fs and sr != target_fs:
        # resample if needed
        num_samples = int(len(audio) * target_fs / sr)
        audio = signal.resample(audio, num_samples)
        sr = target_fs
    return audio, sr
|
244 |
+
|
245 |
+
# Smart mode workflow
|
246 |
+
def _mvsep_separate(api_key, audio, fs, **job_options):
    """Upload `audio` to MVSep as a 16-bit FLAC and return the finished job data."""
    flac_tmp = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
    flac_tmp.close()
    try:
        sf.write(flac_tmp.name, audio, fs, format='FLAC', subtype='PCM_16')
        with open(flac_tmp.name, 'rb') as fh:
            payload = fh.read()
    finally:
        # The bytes are in memory now; drop the temp file even on failure.
        os.unlink(flac_tmp.name)
    return send_mvsep_audio_job(
        api_key, payload, os.path.basename(flac_tmp.name), output_format=2, **job_options
    )


def smart_mode_process(input_file, api_key):
    """
    Build a 5.1 OGG from a stereo file using MVSep stem separation.

    The input is split into a low-passed LFE channel and a high-passed stereo
    signal. The high-passed signal goes through three chained MVSep jobs, and
    the resulting stems are mapped onto the six output channels in the order
    front left, front right, centre, LFE, surround left, surround right.

    Args:
        input_file: Path of the audio file to convert.
        api_key: MVSep API token; required.

    Returns:
        str: Path of the encoded ``.ogg`` temp file (not deleted here).

    Raises:
        gr.Error: If no API key is given, the input is not 2-channel, or an
            MVSep job fails.
        subprocess.CalledProcessError: If ffmpeg exits with an error.
    """
    # NOTE(review): Gradio's docs describe passing gr.Progress() as a
    # default-valued parameter; confirm that a locally constructed tracker
    # actually reports progress to the UI.
    p = gr.Progress()

    if not api_key:
        raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>.")

    # Load original
    wav = convert_to_wav_float(input_file)
    try:
        data, fs = sf.read(wav, dtype='float32')
    finally:
        os.unlink(wav)
    p((0,7), "Loading File")

    # Same 2-channel requirement that extract_phantom_center enforces.
    if data.ndim != 2 or data.shape[1] != 2:
        raise gr.Error("Expected stereo input")
    L, R = data[:, 0], data[:, 1]

    crossover_hz = 120  # split point between the LFE and the other channels

    # Step 1: LFE from lowpass
    p((1,7), "Processing LFE")
    bass = sox_filter(0.5 * (L + R), fs, 'lowpass', crossover_hz)

    # Step 2: Highpass for MVSep
    p((2,7), "Processing Speech, Music and SFX")
    hp_left = sox_filter(L, fs, 'highpass', crossover_hz)
    hp_right = sox_filter(R, fs, 'highpass', crossover_hz)
    hp_stereo = np.column_stack([hp_left, hp_right])

    # NOTE(review): stems are picked by position in the 'files' list; confirm
    # the order against the MVSep API for each sep_type used below.
    demucs_resp = _mvsep_separate(api_key, hp_stereo, fs, sep_type=24, addopt1=1)
    # NOTE(review): the dialog stem is downloaded but never mixed into the
    # output - confirm whether it should feed one of the channels.
    dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
    sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
    music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)

    # Step 3: Extract crowd
    p((3,7), "Extracting Crowd")
    crowd_resp = _mvsep_separate(api_key, music, fs, sep_type=34, addopt1=1)
    crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
    other, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)

    # Step 4: Extract vocals
    p((4,7), "Extracting Vocals")
    karaoke_resp = _mvsep_separate(api_key, other, fs, sep_type=49, addopt1=3, addopt2=1)
    vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
    vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
    instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)

    # Step 5: Phantom center for lead vocals
    p((5,7), "Distributing Front Vocal Channels")
    vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    vl_buf.close()
    try:
        # extract_phantom_center takes a file path, so round-trip via disk.
        sf.write(vl_buf.name, vocals_lead, fs, subtype='FLOAT')
        _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
    finally:
        os.unlink(vl_buf.name)

    # Step 6: Map channels and pad
    p((6,7), "Mapping Channels")
    def match_len(x, length): return np.pad(x, (0, length - len(x)))
    lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), sfx.shape[0], crowd.shape[0], vocals_back.shape[0], instr.shape[0]]
    length = max(lens)

    # The [:, 0] / [:, 1] indexing assumes every downloaded stem is 2-channel.
    out_L = match_len(FL_vl, length) + match_len(instr[:,0], length)
    out_R = match_len(FR_vl, length) + match_len(instr[:,1], length)
    out_C = match_len(FC_vl, length)
    out_LFE = match_len(bass, length)
    SL = match_len(vocals_back[:,0], length) + match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
    SR = match_len(vocals_back[:,1], length) + match_len(sfx[:,1], length) + match_len(crowd[:,1], length)

    # Step 7: Encode to 5.1 OGG
    p((7,7), "Processing Step 7, Encoding")
    multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
    out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
    out_wav.close()
    out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
    # Close our handle before ffmpeg overwrites the file by name.
    out_ogg.close()
    try:
        sf.write(out_wav.name, multich, fs, subtype='FLOAT')
        subprocess.run([
            "ffmpeg", "-y", "-i", out_wav.name,
            "-c:a", "libvorbis", "-ac", "6", "-channel_layout", "5.1", out_ogg.name
        ], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True)
    except Exception:
        # Do not leave an empty or partial .ogg behind when encoding fails.
        os.unlink(out_ogg.name)
        raise
    finally:
        os.unlink(out_wav.name)

    return out_ogg.name
|
347 |
|
348 |
# ========== Gradio UI ==========
|
349 |
with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
|
|
|
351 |
gr.Markdown("Choose music or speech preset for surround processing")
|
352 |
|
353 |
inp = gr.Audio(label="Upload stereo audio", type="filepath")
|
354 |
+
smart_mode = gr.Checkbox(label="Enable Smart Mode", value=False)
|
355 |
+
|
356 |
+
# Normal mode elements
|
357 |
preset = gr.Dropdown(
|
358 |
+
label="Select Preset",
|
359 |
+
choices=["music", "speech", "open"],
|
360 |
+
value="music"
|
361 |
+
)
|
362 |
btn = gr.Button("Convert to 5.1 OGG")
|
363 |
out = gr.File(label="Download 5.1 OGG")
|
364 |
|
365 |
+
# Smart mode section
|
366 |
+
with gr.Column(visible=False) as smart_section:
|
367 |
+
api_key = gr.Textbox(label="MVSep API Key", type="password")
|
368 |
+
smart_btn = gr.Button("Start")
|
369 |
+
smart_out = gr.File(label="Output from Smart Mode")
|
370 |
+
|
371 |
+
# Logic for toggling sections
|
372 |
+
def toggle_mode(enabled):
    """Return visibility updates for the normal-mode widgets and the Smart Mode section.

    The result is a 4-tuple in the order preset, btn, out, smart_section:
    the first three are visible when `enabled` is false, the last when it
    is true.
    """
    show_normal = not enabled
    normal_updates = tuple(gr.update(visible=show_normal) for _ in range(3))
    smart_update = gr.update(visible=enabled)
    return normal_updates + (smart_update,)
|
379 |
+
|
380 |
+
smart_mode.change(
|
381 |
+
fn=toggle_mode,
|
382 |
+
inputs=[smart_mode],
|
383 |
+
outputs=[preset, btn, out, smart_section]
|
384 |
+
)
|
385 |
+
|
386 |
+
# Button functions
|
387 |
+
btn.click(fn=create_5_1_surround, inputs=[inp, preset], outputs=[out], concurrency_limit=10)
|
388 |
+
smart_btn.click(fn=smart_mode_process, inputs=[inp, api_key], outputs=[smart_out], concurrency_limit=20)
|
389 |
|
390 |
if __name__ == "__main__":
|
391 |
+
demo.launch(show_error=True)
|