Spaces:

rc19477
/

dev_only_useless

Sleeping

App Files Community

roychao19477 committed 19 days ago

Commit

b478c0f

1 Parent(s): 9ecc54e

Upload to debug

Browse files

Files changed (1) hide show

app.py +8 -130

app.py CHANGED Viewed

@@ -75,13 +75,6 @@ import spaces
 # Load model once globally
 #ckpt_path = "ckpts/ep215_0906.oat.ckpt"
 #model = AVSEModule.load_from_checkpoint(ckpt_path)
-avse_model = AVSEModule()
-#avse_state_dict = torch.load("ckpts/ep215_0906.oat.ckpt")
-avse_state_dict = torch.load("ckpts/ep220_0908.oat.ckpt")
-avse_model.load_state_dict(avse_state_dict, strict=True)
-avse_model.to("cuda")
-avse_model.eval()
 CHUNK_SIZE_AUDIO = 2 * 48000  # 2 × (3 sec at 16kHz) = 96000 samples (6 sec)
 CHUNK_SIZE_VIDEO = 2 * 75     # 2 × (25fps × 3 sec) = 150 frames (6 sec)
@@ -166,7 +159,15 @@ def extract_resampled_audio(video_path, target_sr=16000):
 def yolo_detection(frame, verbose=False):
     return model(frame, verbose=verbose)[0]
 def extract_faces(video_file):
     cap = cv2.VideoCapture(video_file)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
@@ -264,126 +265,3 @@ iface = gr.Interface(
 )
 iface.launch()
-ckpt = "ckpts/SEMamba_advanced.pth"
-cfg_f = "recipes/SEMamba_advanced.yaml"
-# load config
-with open(cfg_f, 'r') as f:
-    cfg = yaml.safe_load(f)
-# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-device = "cuda"
-model  = SEMamba(cfg).to(device)
-#sdict  = torch.load(ckpt, map_location=device)
-#model.load_state_dict(sdict["generator"])
-#model.eval()
-@spaces.GPU
-def enhance(filepath, model_name):
-    # Load model based on selection
-    ckpt_path = {
-        "VCTK-Demand": "ckpts/SEMamba_advanced.pth",
-        "VCTK+DNS": "ckpts/vd.pth"
-    }[model_name]
-    print("Loading:", ckpt_path)
-    model.load_state_dict(torch.load(ckpt_path, map_location=device)["generator"])
-    model.eval()
-    with torch.no_grad():
-        # load & resample
-        wav, orig_sr = librosa.load(filepath, sr=None)
-        noisy_wav = wav.copy()
-        if orig_sr != 16000:
-            wav = librosa.resample(wav, orig_sr=orig_sr, target_sr=16000)
-        x = torch.from_numpy(wav).float().to(device)
-        norm = torch.sqrt(len(x)/torch.sum(x**2))
-        #x = (x * norm).unsqueeze(0)
-        x = (x * norm)
-        # split into 4s segments (64000 samples)
-        segment_len = 4 * 16000
-        chunks = x.split(segment_len)
-        enhanced_chunks = []
-        for chunk in chunks:
-            if len(chunk) < segment_len:
-                #pad = torch.zeros(segment_len - len(chunk), device=chunk.device)
-                pad = (torch.randn(segment_len - len(chunk), device=chunk.device) * 1e-4)
-                chunk = torch.cat([chunk, pad])
-            chunk = chunk.unsqueeze(0)
-            amp, pha, _ = mag_phase_stft(chunk, 400, 100, 400, 0.3)
-            amp2, pha2, _ = model(amp, pha)
-            out = mag_phase_istft(amp2, pha2, 400, 100, 400, 0.3)
-            out = (out / norm).squeeze(0)
-            enhanced_chunks.append(out)
-        out = torch.cat(enhanced_chunks)[:len(x)].cpu().numpy()  # trim padding
-        # back to original rate
-        if orig_sr != 16000:
-            out = librosa.resample(out, orig_sr=16000, target_sr=orig_sr)
-        # Normalize
-        peak = np.max(np.abs(out))
-        if peak > 0.05:
-            out = out / peak * 0.85
-        # write file
-        sf.write("enhanced.wav", out, orig_sr)
-        # spectrograms
-        fig, axs = plt.subplots(1, 2, figsize=(16, 4))
-        # noisy
-        D_noisy = librosa.stft(noisy_wav, n_fft=512, hop_length=256)
-        S_noisy = librosa.amplitude_to_db(np.abs(D_noisy), ref=np.max)
-        librosa.display.specshow(S_noisy, sr=orig_sr, hop_length=256, x_axis="time", y_axis="hz", ax=axs[0], vmax=0)
-        axs[0].set_title("Noisy Spectrogram")
-        # enhanced
-        D_clean = librosa.stft(out, n_fft=512, hop_length=256)
-        S_clean = librosa.amplitude_to_db(np.abs(D_clean), ref=np.max)
-        librosa.display.specshow(S_clean, sr=orig_sr, hop_length=256, x_axis="time", y_axis="hz", ax=axs[1], vmax=0)
-        #librosa.display.specshow(S_clean, sr=16000, hop_length=512, x_axis="time", y_axis="hz", ax=axs[1], vmax=0)
-        axs[1].set_title("Enhanced Spectrogram")
-        plt.tight_layout()
-    return "enhanced.wav", fig
-#with gr.Blocks() as demo:
-#    gr.Markdown(ABOUT)
-#    input_audio = gr.Audio(label="Input Audio", type="filepath", interactive=True)
-#    enhance_btn = gr.Button("Enhance")
-#    output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
-#    plot_output = gr.Plot(label="Spectrograms")
-#
-#    enhance_btn.click(fn=enhance, inputs=input_audio, outputs=[output_audio, plot_output])
-#
-#demo.queue().launch()
-with gr.Blocks() as demo:
-    gr.Markdown(ABOUT)
-    input_audio = gr.Audio(label="Input Audio", type="filepath", interactive=True)
-    model_choice = gr.Radio(
-        label="Choose Model (The use of VCTK+DNS is recommended)",
-        choices=["VCTK-Demand", "VCTK+DNS"],
-        value="VCTK-Demand"
-    )
-    enhance_btn = gr.Button("Enhance")
-    output_audio = gr.Audio(label="Enhanced Audio", type="filepath")
-    plot_output = gr.Plot(label="Spectrograms")
-    enhance_btn.click(
-        fn=enhance,
-        inputs=[input_audio, model_choice],
-        outputs=[output_audio, plot_output]
-    )
-    gr.Markdown("**Note**: The current models are trained on 16kHz audio. Therefore, any input audio not sampled at 16kHz will be automatically resampled before enhancement.")
-demo.queue().launch()

 # Load model once globally
 #ckpt_path = "ckpts/ep215_0906.oat.ckpt"
 #model = AVSEModule.load_from_checkpoint(ckpt_path)
 CHUNK_SIZE_AUDIO = 2 * 48000  # 2 × (3 sec at 16kHz) = 96000 samples (6 sec)
 CHUNK_SIZE_VIDEO = 2 * 75     # 2 × (25fps × 3 sec) = 150 frames (6 sec)
 def yolo_detection(frame, verbose=False):
     return model(frame, verbose=verbose)[0]
+@spaces.GPU
 def extract_faces(video_file):
+    avse_model = AVSEModule()
+    #avse_state_dict = torch.load("ckpts/ep215_0906.oat.ckpt")
+    avse_state_dict = torch.load("ckpts/ep220_0908.oat.ckpt")
+    avse_model.load_state_dict(avse_state_dict, strict=True)
+    avse_model.to("cuda")
+    avse_model.eval()
     cap = cv2.VideoCapture(video_file)
     fps = cap.get(cv2.CAP_PROP_FPS)
     frames = []
 )
 iface.launch()