Spaces:

rc19477
/

dev_only_useless

Sleeping

App Files Files Community

roychao19477 committed on Jul 16

Commit

27bac1c

1 Parent(s): d3458da

Update chunk feature

Browse files

Files changed (1) hide show

app.py +37 -5

app.py CHANGED Viewed

@@ -103,13 +103,45 @@ def run_avse_inference(video_path, audio_path):
     # Combine into input dict (match what model.enhance expects)
-    data = {
-        "noisy_audio": noisy,
-        "video_frames": bg_frames[np.newaxis, ...]
-    }
     with torch.no_grad():
-        estimated = avse_model.enhance(data).reshape(-1)
     # Save result
     tmp_wav = audio_path.replace(".wav", "_enhanced.wav")

     # Combine into input dict (match what model.enhance expects)
+    #data = {
+    #    "noisy_audio": noisy,
+    #    "video_frames": bg_frames[np.newaxis, ...]
+    #}
     with torch.no_grad():
+        # Version 1
+        #estimated = avse_model.enhance(data).reshape(-1)
+        # Version 2
+        chunk_sec = 3
+        sr = 16000
+        audio_chunk_len = chunk_sec * sr  # 48000
+        video_chunk_len = chunk_sec * 25  # 75
+        estimated_chunks = []
+        for i in range(0, len(noisy), audio_chunk_len):
+            audio_chunk = noisy[i:i+audio_chunk_len]
+            if len(audio_chunk) < audio_chunk_len:
+                pad = np.zeros(audio_chunk_len - len(audio_chunk), dtype=audio_chunk.dtype)
+                audio_chunk = np.concatenate([audio_chunk, pad])
+            vid_idx = i // sr * 25  # convert audio index to video frame index
+            video_chunk = bg_frames[0, vid_idx:vid_idx+video_chunk_len, :, :]
+            if video_chunk.shape[0] < video_chunk_len:
+                pad = np.zeros((video_chunk_len - video_chunk.shape[0], *video_chunk.shape[1:]), dtype=video_chunk.dtype)
+                video_chunk = np.concatenate([video_chunk, pad], axis=0)
+            data = {
+                "noisy_audio": audio_chunk,
+                "video_frames": video_chunk[np.newaxis, ...]
+            }
+            with torch.no_grad():
+                out = avse_model.enhance(data).reshape(-1)
+                estimated_chunks.append(out)
+        estimated = np.concatenate(estimated_chunks)[:len(noisy)]
     # Save result
     tmp_wav = audio_path.replace(".wav", "_enhanced.wav")