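"""Look2Hear audio processing demo.

A Gradio app exposing two tools: TIGER-DnR separation of a mix into
dialog, effects, and music stems, and TIGER speech separation of up to
four speakers from a mixed recording.
"""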
import os
import uuid
import torch
import torchaudio
import torchaudio.transforms as T
import soundfile as sf
import gradio as gr
import look2hear.models

# Set up device: use GPU if available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load models
dnr_model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache")
dnr_model.to(device).eval()

sep_model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir="cache")
sep_model.to(device).eval()

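# The speech separation model expects 16 kHz input; the UI shows up to 4 speakers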
TARGET_SR = 16000
MAX_SPEAKERS = 4

# --- DnR Function ---
def separate_dnr(audio_file):
    audio, sr = torchaudio.load(audio_file)
    audio = audio.to(device)

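    # Add a batch dimension; the model returns dialog, effects, and music stems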
    with torch.no_grad():
        dialog, effect, music = dnr_model(audio[None])

    # Unique output folder
    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_dnr", session_id)
    os.makedirs(output_dir, exist_ok=True)

    dialog_path = os.path.join(output_dir, "dialog.wav")
    effect_path = os.path.join(output_dir, "effect.wav")
    music_path = os.path.join(output_dir, "music.wav")

    torchaudio.save(dialog_path, dialog.cpu(), sr)
    torchaudio.save(effect_path, effect.cpu(), sr)
    torchaudio.save(music_path, music.cpu(), sr)

    return dialog_path, effect_path, music_path

# --- Speaker Separation Function ---
def separate_speakers(audio_path):
    waveform, original_sr = torchaudio.load(audio_path)
    if original_sr != TARGET_SR:
        waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)

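    # Ensure shape [channels, samples], then add a batch dimension for the model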
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    audio_input = waveform.unsqueeze(0).to(device)

    with torch.no_grad():
        ests_speech = sep_model(audio_input)

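    # Drop the batch dimension -> [num_speakers, samples]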
    ests_speech = ests_speech.squeeze(0)

    # Unique output folder
    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_sep", session_id)
    os.makedirs(output_dir, exist_ok=True)

    output_files = []
    for i in range(ests_speech.shape[0]):
        path = os.path.join(output_dir, f"speaker_{i+1}.wav")
        audio_np = ests_speech[i].cpu().numpy()
        sf.write(path, audio_np.T, TARGET_SR)  # soundfile expects [frames, channels]; .T is a no-op for 1-D mono
        output_files.append(path)

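    # Reveal one audio player per separated speaker and hide unused slots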
    updates = []
    for i in range(MAX_SPEAKERS):
        if i < len(output_files):
            updates.append(gr.update(value=output_files[i], visible=True, label=f"Speaker {i+1}"))
        else:
            updates.append(gr.update(value=None, visible=False))
    return updates

# --- Gradio App ---
with gr.Blocks() as demo:
    gr.Markdown("# Look2Hear Audio Processing Toolkit")

    with gr.Tabs():
        # --- Tab 1: DnR ---
        with gr.Tab("Dialog/Effects/Music Separation (DnR)"):
            gr.Markdown("### Separate Dialog, Effects, and Music from Mixed Audio")

            dnr_input = gr.Audio(type="filepath", label="Upload Audio File")
            dnr_button = gr.Button("Separate Audio")

            dnr_output_dialog = gr.Audio(label="Dialog", type="filepath")
            dnr_output_effect = gr.Audio(label="Effects", type="filepath")
            dnr_output_music = gr.Audio(label="Music", type="filepath")

            dnr_button.click(
                fn=separate_dnr,
                inputs=dnr_input,
                outputs=[dnr_output_dialog, dnr_output_effect, dnr_output_music]
            )

        # --- Tab 2: Speaker Separation ---
        with gr.Tab("Speaker Separation"):
            gr.Markdown("### Separate Individual Speakers from Mixed Speech")

            sep_input = gr.Audio(type="filepath", label="Upload Speech Audio")
            sep_button = gr.Button("Separate Speakers")

            gr.Markdown("#### Separated Speakers")
            sep_outputs = []
            for i in range(MAX_SPEAKERS):
                sep_outputs.append(gr.Audio(label=f"Speaker {i+1}", visible=(i == 0), interactive=False))

            sep_button.click(
                fn=separate_speakers,
                inputs=sep_input,
                outputs=sep_outputs
            )

if __name__ == "__main__":
    demo.launch()